test_sampler/
stat_tests.rs

1//! Collection of statistical tests
2//!
3//! Provides a number of statistical tests that can be used to compare samples:
4//!  - Kolmogorov-Smirnov test
5//!  - Kuiper's test
6//!  - Anderson-Darling test
7//!
8//! Each of this tests, checks a null hypothesis that both samples are drawn
9//! from the same distribution. After the test is performed we can compare the
10//! p-value against some error threshold e.g. `5%` :
11//! ```
12//! # use test_sampler::stat_tests::*;
13//! # use rand::prelude::*;
14//! # fn main() -> Result<(), TestError> {
15//! let s1 : Vec<f64> = (0..100).map(|_| rand::thread_rng().gen()).collect();
16//! let s2 : Vec<f64> = (0..70).map(|_| rand::thread_rng().gen()).collect();
17//!
18//! let test_result = ks2_test(s1, s2)?;
19//!
20//! // For the test to reject null hypothesis p_value must be below the threshold
21//! assert!(test_result.p_value() > 0.05);
22//!
23//! # Ok(())}
24//! ```
25//! This basically tells that the value of the test statistic is within `95%`
26//! probability range for the distribution which assumes the null hypothesis that
27//! the samples are from the same distribution. So if we were to run the same
28//! test multiple times, we would find that in `95%` cases the test would not
29//! reject the null hypothesis and `5%` of times it would wrongly reject it,
30//! creating so-called *Type 1* error.
31//!
32//! However, what we are interested in is the frequency of the *Type 2* errors.
33//! They happen when the test does not reject the null hypothesis despite samples
34//! being drawn from different distributions. However, there is no analytical
35//! formula for this, sine the error frequency depends on the difference between
36//! the distributions and size of the samples. The parameter that describes the
37//! frequency of *Type 2* errors is called *Power* of the test and has to be
38//! established by numerical experiments.
39//!
40use std::f64::consts::PI;
41use std::iter::{Iterator, Peekable};
42use thiserror::Error;
43
44///
45/// Error that can be raised by a statistical test
46///
47#[derive(Debug, Error)]
48pub enum TestError {
49    /// Case when a type to test supports only a partial order and some of the entries are not
50    /// present in the order sequence (e.g. NaN in floats)
51    #[error("Collection contains values that cannot be placed in an order sequence (e.g. NaN for floats)")]
52    ContainsNotSortableValues,
53}
54
55/// Empirical cumulative distribution function
56///
57/// Is calculated by sorting the list of samples and represents step-like
58/// approximation to cdf of underling distribution.
59/// The value of ecdf for some `x ∈ [xᵢ, xᵢ₊₁)` is `i/N` where `N` is the total
60/// number of samples.
61///
62/// One can iterate over the ordered samples of ecdf and their associated values
63/// as below:
64/// ```
65/// # use test_sampler::stat_tests::{Ecdf, TestError};
66/// # fn main() -> Result<(), TestError> {
67/// let ecdf = Ecdf::new(vec![0.1, 0.0, 0.7 ,0.2])?;
68///
69/// for (ecdf_value, s) in &ecdf {
70///     println!("{s} {ecdf_value}")
71/// }
72/// # Ok(())}
73/// ```
74///
75#[derive(Debug, Clone)]
76pub struct Ecdf<T>
77where
78    T: PartialOrd + Copy,
79{
80    samples: Vec<T>,
81}
82
83impl<T> Ecdf<T>
84where
85    T: PartialOrd + Copy,
86{
87    /// Create a new instance from unordered vector of samples
88    ///
89    pub fn new(mut samples: Vec<T>) -> Result<Self, TestError> {
90        if !all_comparable(&samples) {
91            return Err(TestError::ContainsNotSortableValues);
92        }
93        samples.sort_by(|a, b| a.partial_cmp(b).expect("Not comparable values in the grid"));
94        Ok(Self { samples })
95    }
96
97    /// Get the value of ecdf at `val`
98    ///
99    pub fn get(&self, val: T) -> f64 {
100        let idx = self.samples.partition_point(|x| *x <= val);
101        idx as f64 / self.samples.len() as f64
102    }
103}
104
105/// Iterator over ecdf
106///
107/// Iterates over pairs (ecdf_value, sample) obtained from [Ecdf].
108///
109#[derive(Debug, Clone)]
110pub struct EcdfIterator<'a, T>
111where
112    T: PartialOrd + Copy,
113{
114    ecdf: &'a Ecdf<T>,
115    idx: usize,
116    n: usize,
117}
118
119impl<'a, T> EcdfIterator<'a, T>
120where
121    T: PartialOrd + Copy,
122{
123    fn new(ecdf: &'a Ecdf<T>) -> Self {
124        Self {
125            ecdf,
126            idx: 0,
127            n: ecdf.samples.len(),
128        }
129    }
130}
131
132impl<'a, T> Iterator for EcdfIterator<'a, T>
133where
134    T: PartialOrd + Copy,
135{
136    type Item = (f64, T);
137    fn next(&mut self) -> Option<Self::Item> {
138        if let Some(x) = self.ecdf.samples.get(self.idx) {
139            self.idx += 1;
140            Some((self.idx as f64 / self.n as f64, *x))
141        } else {
142            None
143        }
144    }
145}
146
147impl<'a, T> IntoIterator for &'a Ecdf<T>
148where
149    T: PartialOrd + Copy,
150{
151    type Item = (f64, T);
152    type IntoIter = EcdfIterator<'a, T>;
153
154    fn into_iter(self) -> Self::IntoIter {
155        Self::IntoIter::new(self)
156    }
157}
158
159///
160/// Check that all elements in the grid are in the partial order
161///
162/// Note that e.g. f64 may contain NaNs which are not comparable to other numbers.
163///
164fn all_comparable<T>(grid: &[T]) -> bool
165where
166    T: PartialOrd,
167{
168    return grid.windows(2).all(|x| x[0].partial_cmp(&x[1]).is_some());
169}
170
171///
172/// Compute the approximate value of the CDF of Anderson-Darling distribution
173///
174/// Uses the approximate expression from the (Marsaglia 2004) paper. Taken
175/// from the C code
176///
177fn anderson_darling_cdf(z: f64) -> f64 {
178    if z == 0.0 {
179        0.0
180    } else if z < 2.0 {
181        f64::exp(-1.2337141 / z) / f64::sqrt(z)
182            * (2.00012
183                + (0.247105 - (0.0649821 - (0.0347962 - (0.011672 - 0.00168691 * z) * z) * z) * z)
184                    * z)
185    } else {
186        f64::exp(-f64::exp(
187            1.0776
188                - (2.30695 - (0.43424 - (0.082433 - (0.008056 - 0.0003146 * z) * z) * z) * z) * z,
189        ))
190    }
191}
192
193/// Represents a result of a statistical test
194///
195/// This struct is returned by a statistical tests and contains:
196/// - value of the test statistic
197/// - p_value for the statistic and the provided effective population size
198///
199/// At the moment test results does not store information about the test
200/// in which it was produced.
201///
202#[derive(Debug, Clone, Copy, PartialEq)]
203pub struct TestResult {
204    stat: f64,
205    p: f64,
206    n: f64,
207}
208
209impl TestResult {
210    /// Compute complement of Kolmogorov-Smirnov Cumulative Distribution Function
211    ///
212    /// Computes: `Q(z) = 1 - CDF(z)`
213    ///
214    /// Implementation is based on the power series definitions from
215    /// "Numerical Recipes" by Press et al. (2007)
216    /// We select the same selection criteria for each of the two series and keep
217    /// the same number of terms
218    ///
219    /// # Panics
220    /// If the value of the statistic is outside the support
221    ///
222    fn complement_ks_cdf(z: f64) -> f64 {
223        if z < 0.0 {
224            panic!("Value of test statistic outside the support");
225        } else if z == 0.0 {
226            1.0
227        } else if z < 1.18 {
228            let factor = f64::sqrt(2.0 * PI) / z;
229            let term = f64::exp(-PI * PI / 8. / (z * z));
230            1.0 - factor * (term + term.powi(9) + term.powi(25) + term.powi(49))
231        } else {
232            let term = f64::exp(-2.0 * z * z);
233            2.0 * (term - term.powi(4) + term.powi(9))
234        }
235    }
236
237    ///
238    /// Create a new instance of the KS test result
239    ///
240    /// # Arguments
241    /// - `stat` - Value of the test statistic
242    /// - `n` - effective sample size
243    ///
244    fn new_ks(stat: f64, n: f64) -> Self {
245        let sqrt_n = f64::sqrt(n);
246        let arg = sqrt_n + 0.12 + 0.11 / sqrt_n;
247        let p = Self::complement_ks_cdf(arg * stat);
248        Self { stat, n, p }
249    }
250
251    ///
252    /// Create a new instance of the result of Kuiper test result
253    ///
254    /// # Arguments
255    /// - `stat` - Value of the test statistic
256    /// - `n` - effective sample size
257    ///
258    fn new_kuiper(stat: f64, n: f64) -> Self {
259        let z = n.sqrt() * stat;
260        let mut p = 0.0;
261
262        // In case the z parameter is very small the sum may become unstable.
263        // Set p to 1.0 in that case (which is a decent approximation)
264        if z < KuiperTerms::Z_MIN {
265            return Self { stat, n, p: 1.0 };
266        }
267
268        // Limit the maximum number of terms to 200
269        for term in KuiperTerms::new(z, n).take(200) {
270            let p_old = p;
271            p += term;
272            if f64::abs(p / p_old - 1.0) < 1e-7 {
273                break;
274            }
275        }
276        Self { stat, n, p }
277    }
278
279    ///
280    /// Create a new instance of the result of Anderson-Darling two-sample test
281    ///
282    /// # Arguments
283    /// - `stat` - Value of the test statistic
284    /// - `n` - effective sample size
285    ///
286    /// We use the approximation of the CDF by (Lewis 1961)
287    ///
288    /// $$ F(z) = 1 - \sqrt{ 1 - 4 \exp(-(z+1)) } $$
289    ///
290    /// Or if the value under square root is negative: $F(z)= 1$
291    /// This approximation should be overly conservative for the small samples.
292    ///
293    fn new_ad(stat: f64, n: f64) -> Self {
294        let p = 1.0 - anderson_darling_cdf(stat);
295        Self { stat, n, p }
296    }
297
298    /// Get the p-value of the test
299    ///
300    /// Probability of observing the data under the assumption that null hypothesis holds
301    ///
302    pub fn p_value(&self) -> f64 {
303        self.p
304    }
305
306    /// Get the value of the test statistic
307    ///
308    pub fn stat(&self) -> f64 {
309        self.stat
310    }
311}
312
313///
314/// Iterator over the terms of Kuiper asymptotic formula for distribution of the test statistic.
315///
316/// The series is derived in the original paper (Kuiper 1960) and its sum should give
317/// the complement of cumulative distribution function for large sample sizes.
318///
319/// However for the small threshold parameter z the summation can become unstable, thus we
320/// set the limit for the z for which the terms can be evaluated.
321///
322struct KuiperTerms {
323    z: f64,
324    n: f64,
325    i: usize,
326}
327
328impl KuiperTerms {
329    /// Stability threshold, below the summation may become unstable
330    pub const Z_MIN: f64 = 0.1;
331
332    /// Create new sequence of Kuiper terms
333    ///
334    /// # Arguments
335    /// - `z` : value of the threshold parameter P( √n * V > z) where `V` is the Kuiper statistic.
336    /// - `n` : effective population of the sample
337    ///
338    /// # Panics
339    /// If `z <= Self::Z_MIN` or `n` is not positive.
340    ///
341    fn new(z: f64, n: f64) -> Self {
342        if z <= Self::Z_MIN {
343            panic!("Value of the test statistic is too small {z} for the series to converge");
344        } else if n <= 0.0 {
345            panic!("Effective population {n} must be +ve value");
346        }
347        Self { z, n, i: 0 }
348    }
349}
350
351impl Iterator for KuiperTerms {
352    type Item = f64;
353
354    fn next(&mut self) -> Option<Self::Item> {
355        self.i += 1;
356        let i = self.i as f64;
357        let zi_sq = (self.z * i).powi(2);
358        let exp_term = f64::exp(-2. * zi_sq);
359        let factor = 8.0 * self.z / (3. * self.n.sqrt());
360        let term = (2. * (4. * zi_sq - 1.) + factor * i.powi(2) * (4. * zi_sq - 3.)) * exp_term;
361        Some(term)
362    }
363}
364
365///
366/// Builds a Brownian bridge of two [Ecdf]s
367///
368/// The `top` ecdf gives positive contributions, `bottom` negative.
369/// We iterate through both sorted sets of samples in-order and depending
370/// whether a sample from the `top` or `bottom` ecdf was taken we nudge the
371/// sum up or down respectively.
372///
373/// We skip over the repeated values. Thus for samples:
374///   [1, 2, 2, 3]
375///   [2]
376/// We get the sequence:
377///   [0.25, -0.25, 0.0]
378///
379struct EcdfBridge<'a, T>
380where
381    T: PartialOrd + Copy,
382{
383    top: Peekable<std::slice::Iter<'a, T>>,
384    bottom: Peekable<std::slice::Iter<'a, T>>,
385    delta_top: f64,
386    delta_bottom: f64,
387    sum: f64,
388    count: usize,
389}
390
391impl<'a, T> EcdfBridge<'a, T>
392where
393    T: PartialOrd + Copy,
394{
395    pub fn new(ecdf_top: &'a Ecdf<T>, ecdf_bottom: &'a Ecdf<T>) -> Self {
396        Self {
397            top: ecdf_top.samples.iter().peekable(),
398            bottom: ecdf_bottom.samples.iter().peekable(),
399            delta_top: 1.0 / ecdf_top.samples.len() as f64,
400            delta_bottom: 1.0 / ecdf_bottom.samples.len() as f64,
401            sum: 0.0,
402            count: 0,
403        }
404    }
405
406    /// Advance the top ecdf
407    ///
408    /// Needs to loop over repeated values.
409    ///
410    /// # Panics
411    /// If it is called when `top` is already empty
412    ///
413    fn advance_top(&mut self) -> f64 {
414        loop {
415            let v = self.top.next().unwrap();
416            self.count += 1;
417            self.sum += self.delta_top;
418
419            let next = self.top.peek();
420
421            if next.is_none() || *next.unwrap() != v {
422                break;
423            }
424        }
425        self.sum
426    }
427
428    /// Advance the bottom ecdf
429    ///
430    /// Needs to loop over repeated values.
431    ///
432    /// # Panics
433    /// If it is called when `bottom` is already empty
434    ///
435    fn advance_bottom(&mut self) -> f64 {
436        loop {
437            let v = self.bottom.next().unwrap();
438            self.count += 1;
439            self.sum -= self.delta_bottom;
440
441            let next = self.bottom.peek();
442
443            if next.is_none() || *next.unwrap() != v {
444                break;
445            }
446        }
447        self.sum
448    }
449
450    /// Enumerate the samples of the bridge
451    ///
452    /// Iterates over `(i, val)` where `i` is the count of all samples. Like
453    /// [EcdfBridge] iterator skips over repeated samples, but they are included
454    /// in the count. Thus the increments of `i` may be larger than `1`.
455    ///
456    pub fn enumerate_samples(self) -> EnumeratedEcdfBridge<'a, T> {
457        EnumeratedEcdfBridge { bridge: self }
458    }
459}
460
461impl<'a, T> Iterator for EcdfBridge<'a, T>
462where
463    T: PartialOrd + Copy,
464{
465    type Item = f64;
466
467    fn next(&mut self) -> Option<Self::Item> {
468        let top = self.top.peek();
469        let bottom = self.bottom.peek();
470
471        match (top, bottom) {
472            (Some(t), Some(b)) => {
473                if t < b {
474                    Some(self.advance_top())
475                } else if t == b {
476                    // We need to be careful if same element is present in both `top` and `bottom`.
477                    // In that case we need to advance both iterators
478                    self.advance_top();
479                    Some(self.advance_bottom())
480                } else {
481                    Some(self.advance_bottom())
482                }
483            }
484            (None, Some(_)) => Some(self.advance_bottom()),
485            (Some(_), None) => Some(self.advance_top()),
486            (None, None) => None,
487        }
488    }
489}
490
491///
492/// Returned from [EcdfBridge::enumerate_samples]
493///
494/// Wraps around [EcdfBridge] and allows to enumerate the samples, but
495/// unlike the ordinary [std::iter::Iterator::enumerate] accounts for the
496/// 'skips' (non 1 increment) due to the repeated samples.
497///
498struct EnumeratedEcdfBridge<'a, T>
499where
500    T: PartialOrd + Copy,
501{
502    bridge: EcdfBridge<'a, T>,
503}
504
505impl<'a, T> Iterator for EnumeratedEcdfBridge<'a, T>
506where
507    T: PartialOrd + Copy,
508{
509    type Item = (usize, f64);
510
511    fn next(&mut self) -> Option<Self::Item> {
512        match self.bridge.next() {
513            Some(v) => Some((self.bridge.count - 1, v)),
514            None => None,
515        }
516    }
517}
518
519///
520/// Perform one sample Kolmogorov-Smirnov statistical test
521///
522/// Evaluates the Kolmogorov-Smirnov statistic:
523///
524/// $$ KS = \max |F_1(x) - F_{ref}(x)| $$
525///
526/// Where $F_1$ is the empirical cumulative distribution function of the sample
527/// and the $F_{ref}$ the reference distribution provided by the user as an argument.
528/// The p-value is calculated from the statistic as prescribed in (Press 2007).
529///
530/// # References
531/// - Press W. H. , Teukolsky S. A., Vetterling W. T., Flannery B. P. (2007).
532///   Numerical Recipes 3rd Edition: The Art of Scientific Computing (3rd. ed.).
533///   Cambridge University Press, USA.
534///
535pub fn ks1_test<T>(cdf: impl Fn(&T) -> f64, samples: Vec<T>) -> Result<TestResult, TestError>
536where
537    T: PartialOrd + Copy,
538{
539    let n = samples.len();
540    let ecdf = Ecdf::new(samples)?;
541
542    let mut stat = 0.0;
543
544    for (ecdf_value, v) in &ecdf {
545        let new_stat = (cdf(&v) - ecdf_value).abs();
546        if new_stat > stat {
547            stat = new_stat;
548        }
549    }
550
551    Ok(TestResult::new_ks(stat, n as f64))
552}
553
554///
555/// Preform two sample Kolmogorov-Smirnov test
556///
557/// Evaluates the Kolmogorov-Smirnov statistic:
558///
559/// $$ KS = \max |F_1(x) - F_2(x)| $$
560///
561/// Where both $F_1$ and $F_2$ are empirical cumulative distribution functions.
562/// The p-value is calculated from the statistic as prescribed in (Press 2007)
563///
564/// # References
565/// - Press W. H. , Teukolsky S. A., Vetterling W. T., Flannery B. P. (2007).
566///   Numerical Recipes 3rd Edition: The Art of Scientific Computing (3rd. ed.).
567///   Cambridge University Press, USA.
568///
569pub fn ks2_test<T>(sample1: Vec<T>, sample2: Vec<T>) -> Result<TestResult, TestError>
570where
571    T: PartialOrd + Copy,
572{
573    let n1 = sample1.len() as f64;
574    let n2 = sample2.len() as f64;
575
576    let ecdf1 = Ecdf::new(sample1)?;
577    let ecdf2 = Ecdf::new(sample2)?;
578
579    // Compute statistic
580    let stat = EcdfBridge::new(&ecdf1, &ecdf2)
581        .map(|v| v.abs())
582        .max_by(|a, b| a.partial_cmp(b).expect("PartialOrd comparison failed"))
583        .expect("Empty iterator?!");
584
585    Ok(TestResult::new_ks(stat, n1 * n2 / (n1 + n2)))
586}
587
588///
589/// Preform 2-sample Kuiper's test
590///
591/// Kuiper's test is the modification of the Kolmogorov-Smirnov test. Instead
592/// of searching for maximum deviation between empirical cdfs, it uses the
593/// following as the test statistic:
594/// $$ K = \max{[F_1(x) - F_2(x)]} + \min{[F_1(x) - F_2(x)]} $$
595///
596/// Where $F_i(x)$ is of course empirical cdf for the i-th sample.
597///
598/// The particularly interesting feature of the Kuiper's test is, that if the
599/// support is a circle, the test statistic is invariant under rotation.
600/// Thus it is used particularly often for angular distributions.
601/// It is also said to be more sensitive 'at the tails' of the distribution than
602/// KS test.
603///
604/// # References
605/// - Kuiper, N.H. (1960). Tests concerning random points on a circle.
606///   Indagationes Mathematicae (Proceedings), 63, 38-47.
607///   <https://doi.org/10.1016/S1385-7258(60)50006-0>
608///
609///
610pub fn kuiper2_test<T>(sample1: Vec<T>, sample2: Vec<T>) -> Result<TestResult, TestError>
611where
612    T: PartialOrd + Copy,
613{
614    let n1 = sample1.len() as f64;
615    let n2 = sample2.len() as f64;
616
617    let ecdf1 = Ecdf::new(sample1)?;
618    let ecdf2 = Ecdf::new(sample2)?;
619
620    let mut minimum = 0.0;
621    let mut maximum = 0.0;
622
623    for v in EcdfBridge::new(&ecdf1, &ecdf2) {
624        if v < minimum {
625            minimum = v
626        } else if v > maximum {
627            maximum = v
628        }
629    }
630    let stat = minimum.abs() + maximum.abs();
631
632    Ok(TestResult::new_kuiper(stat, n1 * n2 / (n1 + n2)))
633}
634
635///
636/// Perform Anderson-Darling two sample test
637///
638/// Should only be used with continuous distributions and large samples!
639/// (More than few 100s).
640///
641/// Implementation is based on the (Pettitt 1976) paper. As the result can be
642/// used only for continuous distributions where the duplicate values in the
643/// samples have vanishingly small probability to occur.
644///
645/// Note that the (Pettitt 1976) seems to have slightly different behaviour
646/// from the $ A^2\_{kN} $ statistic of (Scholz 1987) if the duplicate samples
647/// are present.
648///
649/// The p-value of the test is obtained using the simplified expression of
650/// (Marsaglia 2004) [see the C code distributed with the paper]. The p-value
651/// calculation uses large-sample approximation and may overestimate the
652/// distribution for smaller sample sizes.
653///
654/// # References
655/// - Pettitt, A. N. (1976). A Two-Sample Anderson--Darling Rank Statistic. Biometrika, 63(1),
656///   161–168. <https://doi.org/10.2307/2335097>
657/// - Scholz, F. W., & Stephens, M. A. (1987). K-Sample Anderson-Darling Tests.
658///   Journal of the American Statistical Association, 82(399), 918–924.
659///   <https://doi.org/10.2307/2288805>
660/// - Marsaglia, G., & Marsaglia, J. (2004). Evaluating the Anderson-Darling
661///   Distribution. Journal of Statistical Software, 9(2), 1–5.
662///   <https://doi.org/10.18637/jss.v009.i02>
663///
664pub fn ad2_test<T>(sample1: Vec<T>, sample2: Vec<T>) -> Result<TestResult, TestError>
665where
666    T: PartialOrd + Copy,
667{
668    let n1 = sample1.len() as f64;
669    let n2 = sample2.len() as f64;
670    let tot_n = sample1.len() + sample2.len();
671
672    let ecdf1 = Ecdf::new(sample1)?;
673    let ecdf2 = Ecdf::new(sample2)?;
674
675    let mut stat = 0.0;
676
677    for (idx, v) in EcdfBridge::new(&ecdf1, &ecdf2).enumerate_samples() {
678        let denum = (idx + 1) * (tot_n - idx - 1);
679
680        // We need to exclude last entry which will be a NaN [0./0.]
681        if denum != 0 {
682            stat += v.powi(2) / denum as f64
683        }
684    }
685
686    // Scale by the population
687    stat *= n1 * n2;
688
689    Ok(TestResult::new_ad(stat, tot_n as f64))
690}
691
692#[cfg(test)]
693mod tests {
694    use super::*;
695
696    #[test]
697    fn test_ad_cdf() {
698        let val = vec![0.0, 0.3, 2.0, 3.0];
699        let ref_vals = vec![0.0, 0.06183989, 0.90816425, 0.97263617];
700
701        let rhs = val
702            .iter()
703            .map(|x| anderson_darling_cdf(*x))
704            .collect::<Vec<f64>>();
705
706        for (l, r) in std::iter::zip(ref_vals, rhs) {
707            approx::assert_relative_eq!(l, r, max_relative = 1.0e-6);
708        }
709    }
710
711    #[test]
712    fn test_ecdf() {
713        let samples = [0.3, 0.1, 0.5, 0.7];
714        let ecdf = Ecdf::new(samples.into()).unwrap();
715
716        assert_eq!(0.0, ecdf.get(-2.0));
717
718        assert_eq!(0.25, ecdf.get(0.1));
719        assert_eq!(0.25, ecdf.get(0.2));
720
721        assert_eq!(0.75, ecdf.get(0.5));
722        assert_eq!(0.75, ecdf.get(0.6));
723
724        assert_eq!(1.0, ecdf.get(0.7));
725        assert_eq!(1.0, ecdf.get(0.71));
726    }
727
728    #[test]
729    fn test_ks1_test() {
730        let samples = [0.3, 0.2, 0.25, 0.1, 0.9, 0.6];
731        let lhs = TestResult::new_ks(4.0 / 6.0 - 0.3, 6.0);
732        let rhs = ks1_test(|x| *x, samples.into()).unwrap();
733        assert_eq!(lhs, rhs);
734    }
735
736    #[test]
737    fn test_ks1_error() {
738        let samples = [0.3, 0.2, 0.25, 0.1, 0.9, 0.6, f64::NAN];
739        assert!(
740            ks1_test(|x| *x, samples.into()).is_err(),
741            "Failed to detect nan in the list"
742        )
743    }
744
745    #[test]
746    fn test_ks2_test() {
747        let samples1 = [0.3, 0.2, 0.25, 0.1, 0.9, 0.6];
748        let samples2 = [0.1, 0.8, 0.34, 0.09, 0.12, 0.81];
749
750        let rhs = ks2_test(samples1.into(), samples2.into()).unwrap();
751        let lhs = TestResult::new_ks(1.0 / 3.0, 3.0);
752        assert_eq!(lhs, rhs);
753    }
754
755    #[test]
756    fn test_ks2_test_repeated_samples() {
757        let samples1 = [1, 2, 2, 3];
758        let samples2 = [2];
759
760        let rhs = ks2_test(samples1.into(), samples2.into()).unwrap();
761        let lhs = TestResult::new_ks(0.25, 0.8);
762        assert_eq!(lhs, rhs);
763    }
764
765    #[test]
766    fn test_kuiper() {
767        let sample1 = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
768        let sample2 = vec![11, 2, 3, 4, 5, 6, 7, 8, 9, 10];
769
770        let rhs = kuiper2_test(sample1, sample2).unwrap();
771        let lhs = TestResult::new_kuiper(0.1, 5.0);
772
773        println!("{rhs:#?}");
774        assert_eq!(lhs, rhs);
775    }
776
777    #[test]
778    fn test_kuiper_same_sample() {
779        let sample1 = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
780
781        let rhs = kuiper2_test(sample1.clone(), sample1.clone()).unwrap();
782        let lhs = TestResult::new_kuiper(0.0, 5.0);
783
784        println!("{rhs:#?}");
785        assert_eq!(lhs, rhs);
786    }
787
788    #[test]
789    fn test_kuiper_2() {
790        let sample1 = vec![0.07, 0.74, 0.20, 0.55, 0.33, 0.98, 0.32, 0.36, 0.86, 0.43];
791        let sample2 = vec![0.73, 0.10, 0.49, 0.18, 0.87, 0.25, 0.80, 0.54, 0.90, 0.06];
792
793        let rhs = kuiper2_test(sample1, sample2).unwrap();
794        let lhs = TestResult::new_kuiper(0.4, 5.0);
795
796        println!("{rhs:#?}");
797        assert_eq!(lhs, rhs);
798    }
799
800    #[test]
801    fn test_ad() {
802        let sample1 = vec![0.07, 0.74, 0.20, 0.55, 0.33, 0.98, 0.32, 0.36, 0.86, 0.43];
803        let sample2 = vec![0.73, 0.10, 0.49, 0.18, 0.87, 0.25, 0.80, 0.54, 0.90, 0.06];
804
805        let rhs = ad2_test(sample1, sample2).unwrap();
806
807        // Reference value taken from SciPy
808        // (Internal `_anderson_ksamp_right` was used to get statistic before normalisation)
809        let lhs = TestResult::new_ad(0.3634446006350032, 20.0);
810
811        println!("{rhs:#?}");
812        assert_eq!(lhs, rhs);
813    }
814
815    #[test]
816    fn test_ecdf_bridge() {
817        let cases = [
818            (vec![1, 2, 2, 3], vec![2], vec![0.25, -0.25, 0.0]),
819            (vec![2], vec![1, 2, 2, 3], vec![-0.25, 0.25, 0.0]),
820            (vec![1, 2, 2, 3], vec![1, 2, 2, 3], vec![0.0, 0.0, 0.0]),
821            (vec![1, 2, 3, 4], vec![3, 3], vec![0.25, 0.5, -0.25, 0.0]),
822            (vec![2], vec![1, 2, 2, 2], vec![-0.25, 0.0]),
823        ];
824
825        for (s1, s2, reference) in cases {
826            let ecdf1 = Ecdf::new(s1).unwrap();
827            let ecdf2 = Ecdf::new(s2).unwrap();
828
829            let bridge = EcdfBridge::new(&ecdf1, &ecdf2).collect::<Vec<_>>();
830
831            assert_eq!(bridge, reference);
832        }
833    }
834
835    #[test]
836    fn test_ecdf_enumerated_bridge() {
837        let sample1 = vec![1, 2, 2, 3, 3];
838        let sample2 = vec![0, 2, 4];
839
840        let ecdf1 = Ecdf::new(sample1).unwrap();
841        let ecdf2 = Ecdf::new(sample2).unwrap();
842
843        let ref_count: Vec<usize> = vec![0, 1, 4, 6, 7];
844
845        let count = EcdfBridge::new(&ecdf1, &ecdf2)
846            .enumerate_samples()
847            .map(|(i, _)| i)
848            .collect::<Vec<usize>>();
849
850        assert_eq!(ref_count, count);
851    }
852
853    #[test]
854    fn test_ks_cdf_complement() {
855        // Approximate reference points obtained from SciPy
856        let test_points = [
857            (0.0, 1.0),
858            (1.0, 2.6999967168e-01),
859            (2.0, 6.7092525578e-04),
860            (3.0, 3.045996e-08),
861            (100.0, 0.0),
862        ];
863
864        for (x, val) in test_points {
865            approx::assert_relative_eq!(
866                val,
867                TestResult::complement_ks_cdf(x),
868                max_relative = 1.0e-7
869            );
870        }
871    }
872
873    #[test]
874    #[should_panic]
875    fn test_ks_cdf_complement_invalid_range() {
876        TestResult::complement_ks_cdf(-2.0);
877    }
878}
test_sampler/stat_tests.rs

test_sampler/
stat_tests.rs