test_sampler/stat_tests.rs
1//! Collection of statistical tests
2//!
3//! Provides a number of statistical tests that can be used to compare samples:
4//! - Kolmogorov-Smirnov test
5//! - Kuiper's test
6//! - Anderson-Darling test
7//!
8//! Each of this tests, checks a null hypothesis that both samples are drawn
9//! from the same distribution. After the test is performed we can compare the
10//! p-value against some error threshold e.g. `5%` :
11//! ```
12//! # use test_sampler::stat_tests::*;
13//! # use rand::prelude::*;
14//! # fn main() -> Result<(), TestError> {
15//! let s1 : Vec<f64> = (0..100).map(|_| rand::thread_rng().gen()).collect();
16//! let s2 : Vec<f64> = (0..70).map(|_| rand::thread_rng().gen()).collect();
17//!
18//! let test_result = ks2_test(s1, s2)?;
19//!
20//! // For the test to reject null hypothesis p_value must be below the threshold
21//! assert!(test_result.p_value() > 0.05);
22//!
23//! # Ok(())}
24//! ```
25//! This basically tells that the value of the test statistic is within `95%`
26//! probability range for the distribution which assumes the null hypothesis that
27//! the samples are from the same distribution. So if we were to run the same
28//! test multiple times, we would find that in `95%` cases the test would not
29//! reject the null hypothesis and `5%` of times it would wrongly reject it,
30//! creating so-called *Type 1* error.
31//!
32//! However, what we are interested in is the frequency of the *Type 2* errors.
33//! They happen when the test does not reject the null hypothesis despite samples
34//! being drawn from different distributions. However, there is no analytical
35//! formula for this, sine the error frequency depends on the difference between
36//! the distributions and size of the samples. The parameter that describes the
37//! frequency of *Type 2* errors is called *Power* of the test and has to be
38//! established by numerical experiments.
39//!
40use std::f64::consts::PI;
41use std::iter::{Iterator, Peekable};
42use thiserror::Error;
43
44///
45/// Error that can be raised by a statistical test
46///
47#[derive(Debug, Error)]
48pub enum TestError {
49 /// Case when a type to test supports only a partial order and some of the entries are not
50 /// present in the order sequence (e.g. NaN in floats)
51 #[error("Collection contains values that cannot be placed in an order sequence (e.g. NaN for floats)")]
52 ContainsNotSortableValues,
53}
54
55/// Empirical cumulative distribution function
56///
57/// Is calculated by sorting the list of samples and represents step-like
58/// approximation to cdf of underling distribution.
59/// The value of ecdf for some `x ∈ [xᵢ, xᵢ₊₁)` is `i/N` where `N` is the total
60/// number of samples.
61///
62/// One can iterate over the ordered samples of ecdf and their associated values
63/// as below:
64/// ```
65/// # use test_sampler::stat_tests::{Ecdf, TestError};
66/// # fn main() -> Result<(), TestError> {
67/// let ecdf = Ecdf::new(vec![0.1, 0.0, 0.7 ,0.2])?;
68///
69/// for (ecdf_value, s) in &ecdf {
70/// println!("{s} {ecdf_value}")
71/// }
72/// # Ok(())}
73/// ```
74///
75#[derive(Debug, Clone)]
76pub struct Ecdf<T>
77where
78 T: PartialOrd + Copy,
79{
80 samples: Vec<T>,
81}
82
83impl<T> Ecdf<T>
84where
85 T: PartialOrd + Copy,
86{
87 /// Create a new instance from unordered vector of samples
88 ///
89 pub fn new(mut samples: Vec<T>) -> Result<Self, TestError> {
90 if !all_comparable(&samples) {
91 return Err(TestError::ContainsNotSortableValues);
92 }
93 samples.sort_by(|a, b| a.partial_cmp(b).expect("Not comparable values in the grid"));
94 Ok(Self { samples })
95 }
96
97 /// Get the value of ecdf at `val`
98 ///
99 pub fn get(&self, val: T) -> f64 {
100 let idx = self.samples.partition_point(|x| *x <= val);
101 idx as f64 / self.samples.len() as f64
102 }
103}
104
105/// Iterator over ecdf
106///
107/// Iterates over pairs (ecdf_value, sample) obtained from [Ecdf].
108///
109#[derive(Debug, Clone)]
110pub struct EcdfIterator<'a, T>
111where
112 T: PartialOrd + Copy,
113{
114 ecdf: &'a Ecdf<T>,
115 idx: usize,
116 n: usize,
117}
118
119impl<'a, T> EcdfIterator<'a, T>
120where
121 T: PartialOrd + Copy,
122{
123 fn new(ecdf: &'a Ecdf<T>) -> Self {
124 Self {
125 ecdf,
126 idx: 0,
127 n: ecdf.samples.len(),
128 }
129 }
130}
131
132impl<'a, T> Iterator for EcdfIterator<'a, T>
133where
134 T: PartialOrd + Copy,
135{
136 type Item = (f64, T);
137 fn next(&mut self) -> Option<Self::Item> {
138 if let Some(x) = self.ecdf.samples.get(self.idx) {
139 self.idx += 1;
140 Some((self.idx as f64 / self.n as f64, *x))
141 } else {
142 None
143 }
144 }
145}
146
147impl<'a, T> IntoIterator for &'a Ecdf<T>
148where
149 T: PartialOrd + Copy,
150{
151 type Item = (f64, T);
152 type IntoIter = EcdfIterator<'a, T>;
153
154 fn into_iter(self) -> Self::IntoIter {
155 Self::IntoIter::new(self)
156 }
157}
158
159///
160/// Check that all elements in the grid are in the partial order
161///
162/// Note that e.g. f64 may contain NaNs which are not comparable to other numbers.
163///
164fn all_comparable<T>(grid: &[T]) -> bool
165where
166 T: PartialOrd,
167{
168 return grid.windows(2).all(|x| x[0].partial_cmp(&x[1]).is_some());
169}
170
171///
172/// Compute the approximate value of the CDF of Anderson-Darling distribution
173///
174/// Uses the approximate expression from the (Marsaglia 2004) paper. Taken
175/// from the C code
176///
177fn anderson_darling_cdf(z: f64) -> f64 {
178 if z == 0.0 {
179 0.0
180 } else if z < 2.0 {
181 f64::exp(-1.2337141 / z) / f64::sqrt(z)
182 * (2.00012
183 + (0.247105 - (0.0649821 - (0.0347962 - (0.011672 - 0.00168691 * z) * z) * z) * z)
184 * z)
185 } else {
186 f64::exp(-f64::exp(
187 1.0776
188 - (2.30695 - (0.43424 - (0.082433 - (0.008056 - 0.0003146 * z) * z) * z) * z) * z,
189 ))
190 }
191}
192
193/// Represents a result of a statistical test
194///
195/// This struct is returned by a statistical tests and contains:
196/// - value of the test statistic
197/// - p_value for the statistic and the provided effective population size
198///
199/// At the moment test results does not store information about the test
200/// in which it was produced.
201///
202#[derive(Debug, Clone, Copy, PartialEq)]
203pub struct TestResult {
204 stat: f64,
205 p: f64,
206 n: f64,
207}
208
209impl TestResult {
210 /// Compute complement of Kolmogorov-Smirnov Cumulative Distribution Function
211 ///
212 /// Computes: `Q(z) = 1 - CDF(z)`
213 ///
214 /// Implementation is based on the power series definitions from
215 /// "Numerical Recipes" by Press et al. (2007)
216 /// We select the same selection criteria for each of the two series and keep
217 /// the same number of terms
218 ///
219 /// # Panics
220 /// If the value of the statistic is outside the support
221 ///
222 fn complement_ks_cdf(z: f64) -> f64 {
223 if z < 0.0 {
224 panic!("Value of test statistic outside the support");
225 } else if z == 0.0 {
226 1.0
227 } else if z < 1.18 {
228 let factor = f64::sqrt(2.0 * PI) / z;
229 let term = f64::exp(-PI * PI / 8. / (z * z));
230 1.0 - factor * (term + term.powi(9) + term.powi(25) + term.powi(49))
231 } else {
232 let term = f64::exp(-2.0 * z * z);
233 2.0 * (term - term.powi(4) + term.powi(9))
234 }
235 }
236
237 ///
238 /// Create a new instance of the KS test result
239 ///
240 /// # Arguments
241 /// - `stat` - Value of the test statistic
242 /// - `n` - effective sample size
243 ///
244 fn new_ks(stat: f64, n: f64) -> Self {
245 let sqrt_n = f64::sqrt(n);
246 let arg = sqrt_n + 0.12 + 0.11 / sqrt_n;
247 let p = Self::complement_ks_cdf(arg * stat);
248 Self { stat, n, p }
249 }
250
251 ///
252 /// Create a new instance of the result of Kuiper test result
253 ///
254 /// # Arguments
255 /// - `stat` - Value of the test statistic
256 /// - `n` - effective sample size
257 ///
258 fn new_kuiper(stat: f64, n: f64) -> Self {
259 let z = n.sqrt() * stat;
260 let mut p = 0.0;
261
262 // In case the z parameter is very small the sum may become unstable.
263 // Set p to 1.0 in that case (which is a decent approximation)
264 if z < KuiperTerms::Z_MIN {
265 return Self { stat, n, p: 1.0 };
266 }
267
268 // Limit the maximum number of terms to 200
269 for term in KuiperTerms::new(z, n).take(200) {
270 let p_old = p;
271 p += term;
272 if f64::abs(p / p_old - 1.0) < 1e-7 {
273 break;
274 }
275 }
276 Self { stat, n, p }
277 }
278
279 ///
280 /// Create a new instance of the result of Anderson-Darling two-sample test
281 ///
282 /// # Arguments
283 /// - `stat` - Value of the test statistic
284 /// - `n` - effective sample size
285 ///
286 /// We use the approximation of the CDF by (Lewis 1961)
287 ///
288 /// $$ F(z) = 1 - \sqrt{ 1 - 4 \exp(-(z+1)) } $$
289 ///
290 /// Or if the value under square root is negative: $F(z)= 1$
291 /// This approximation should be overly conservative for the small samples.
292 ///
293 fn new_ad(stat: f64, n: f64) -> Self {
294 let p = 1.0 - anderson_darling_cdf(stat);
295 Self { stat, n, p }
296 }
297
298 /// Get the p-value of the test
299 ///
300 /// Probability of observing the data under the assumption that null hypothesis holds
301 ///
302 pub fn p_value(&self) -> f64 {
303 self.p
304 }
305
306 /// Get the value of the test statistic
307 ///
308 pub fn stat(&self) -> f64 {
309 self.stat
310 }
311}
312
313///
314/// Iterator over the terms of Kuiper asymptotic formula for distribution of the test statistic.
315///
316/// The series is derived in the original paper (Kuiper 1960) and its sum should give
317/// the complement of cumulative distribution function for large sample sizes.
318///
319/// However for the small threshold parameter z the summation can become unstable, thus we
320/// set the limit for the z for which the terms can be evaluated.
321///
322struct KuiperTerms {
323 z: f64,
324 n: f64,
325 i: usize,
326}
327
328impl KuiperTerms {
329 /// Stability threshold, below the summation may become unstable
330 pub const Z_MIN: f64 = 0.1;
331
332 /// Create new sequence of Kuiper terms
333 ///
334 /// # Arguments
335 /// - `z` : value of the threshold parameter P( √n * V > z) where `V` is the Kuiper statistic.
336 /// - `n` : effective population of the sample
337 ///
338 /// # Panics
339 /// If `z <= Self::Z_MIN` or `n` is not positive.
340 ///
341 fn new(z: f64, n: f64) -> Self {
342 if z <= Self::Z_MIN {
343 panic!("Value of the test statistic is too small {z} for the series to converge");
344 } else if n <= 0.0 {
345 panic!("Effective population {n} must be +ve value");
346 }
347 Self { z, n, i: 0 }
348 }
349}
350
351impl Iterator for KuiperTerms {
352 type Item = f64;
353
354 fn next(&mut self) -> Option<Self::Item> {
355 self.i += 1;
356 let i = self.i as f64;
357 let zi_sq = (self.z * i).powi(2);
358 let exp_term = f64::exp(-2. * zi_sq);
359 let factor = 8.0 * self.z / (3. * self.n.sqrt());
360 let term = (2. * (4. * zi_sq - 1.) + factor * i.powi(2) * (4. * zi_sq - 3.)) * exp_term;
361 Some(term)
362 }
363}
364
365///
366/// Builds a Brownian bridge of two [Ecdf]s
367///
368/// The `top` ecdf gives positive contributions, `bottom` negative.
369/// We iterate through both sorted sets of samples in-order and depending
370/// whether a sample from the `top` or `bottom` ecdf was taken we nudge the
371/// sum up or down respectively.
372///
373/// We skip over the repeated values. Thus for samples:
374/// [1, 2, 2, 3]
375/// [2]
376/// We get the sequence:
377/// [0.25, -0.25, 0.0]
378///
379struct EcdfBridge<'a, T>
380where
381 T: PartialOrd + Copy,
382{
383 top: Peekable<std::slice::Iter<'a, T>>,
384 bottom: Peekable<std::slice::Iter<'a, T>>,
385 delta_top: f64,
386 delta_bottom: f64,
387 sum: f64,
388 count: usize,
389}
390
391impl<'a, T> EcdfBridge<'a, T>
392where
393 T: PartialOrd + Copy,
394{
395 pub fn new(ecdf_top: &'a Ecdf<T>, ecdf_bottom: &'a Ecdf<T>) -> Self {
396 Self {
397 top: ecdf_top.samples.iter().peekable(),
398 bottom: ecdf_bottom.samples.iter().peekable(),
399 delta_top: 1.0 / ecdf_top.samples.len() as f64,
400 delta_bottom: 1.0 / ecdf_bottom.samples.len() as f64,
401 sum: 0.0,
402 count: 0,
403 }
404 }
405
406 /// Advance the top ecdf
407 ///
408 /// Needs to loop over repeated values.
409 ///
410 /// # Panics
411 /// If it is called when `top` is already empty
412 ///
413 fn advance_top(&mut self) -> f64 {
414 loop {
415 let v = self.top.next().unwrap();
416 self.count += 1;
417 self.sum += self.delta_top;
418
419 let next = self.top.peek();
420
421 if next.is_none() || *next.unwrap() != v {
422 break;
423 }
424 }
425 self.sum
426 }
427
428 /// Advance the bottom ecdf
429 ///
430 /// Needs to loop over repeated values.
431 ///
432 /// # Panics
433 /// If it is called when `bottom` is already empty
434 ///
435 fn advance_bottom(&mut self) -> f64 {
436 loop {
437 let v = self.bottom.next().unwrap();
438 self.count += 1;
439 self.sum -= self.delta_bottom;
440
441 let next = self.bottom.peek();
442
443 if next.is_none() || *next.unwrap() != v {
444 break;
445 }
446 }
447 self.sum
448 }
449
450 /// Enumerate the samples of the bridge
451 ///
452 /// Iterates over `(i, val)` where `i` is the count of all samples. Like
453 /// [EcdfBridge] iterator skips over repeated samples, but they are included
454 /// in the count. Thus the increments of `i` may be larger than `1`.
455 ///
456 pub fn enumerate_samples(self) -> EnumeratedEcdfBridge<'a, T> {
457 EnumeratedEcdfBridge { bridge: self }
458 }
459}
460
461impl<'a, T> Iterator for EcdfBridge<'a, T>
462where
463 T: PartialOrd + Copy,
464{
465 type Item = f64;
466
467 fn next(&mut self) -> Option<Self::Item> {
468 let top = self.top.peek();
469 let bottom = self.bottom.peek();
470
471 match (top, bottom) {
472 (Some(t), Some(b)) => {
473 if t < b {
474 Some(self.advance_top())
475 } else if t == b {
476 // We need to be careful if same element is present in both `top` and `bottom`.
477 // In that case we need to advance both iterators
478 self.advance_top();
479 Some(self.advance_bottom())
480 } else {
481 Some(self.advance_bottom())
482 }
483 }
484 (None, Some(_)) => Some(self.advance_bottom()),
485 (Some(_), None) => Some(self.advance_top()),
486 (None, None) => None,
487 }
488 }
489}
490
491///
492/// Returned from [EcdfBridge::enumerate_samples]
493///
494/// Wraps around [EcdfBridge] and allows to enumerate the samples, but
495/// unlike the ordinary [std::iter::Iterator::enumerate] accounts for the
496/// 'skips' (non 1 increment) due to the repeated samples.
497///
498struct EnumeratedEcdfBridge<'a, T>
499where
500 T: PartialOrd + Copy,
501{
502 bridge: EcdfBridge<'a, T>,
503}
504
505impl<'a, T> Iterator for EnumeratedEcdfBridge<'a, T>
506where
507 T: PartialOrd + Copy,
508{
509 type Item = (usize, f64);
510
511 fn next(&mut self) -> Option<Self::Item> {
512 match self.bridge.next() {
513 Some(v) => Some((self.bridge.count - 1, v)),
514 None => None,
515 }
516 }
517}
518
519///
520/// Perform one sample Kolmogorov-Smirnov statistical test
521///
522/// Evaluates the Kolmogorov-Smirnov statistic:
523///
524/// $$ KS = \max |F_1(x) - F_{ref}(x)| $$
525///
526/// Where $F_1$ is the empirical cumulative distribution function of the sample
527/// and the $F_{ref}$ the reference distribution provided by the user as an argument.
528/// The p-value is calculated from the statistic as prescribed in (Press 2007).
529///
530/// # References
531/// - Press W. H. , Teukolsky S. A., Vetterling W. T., Flannery B. P. (2007).
532/// Numerical Recipes 3rd Edition: The Art of Scientific Computing (3rd. ed.).
533/// Cambridge University Press, USA.
534///
535pub fn ks1_test<T>(cdf: impl Fn(&T) -> f64, samples: Vec<T>) -> Result<TestResult, TestError>
536where
537 T: PartialOrd + Copy,
538{
539 let n = samples.len();
540 let ecdf = Ecdf::new(samples)?;
541
542 let mut stat = 0.0;
543
544 for (ecdf_value, v) in &ecdf {
545 let new_stat = (cdf(&v) - ecdf_value).abs();
546 if new_stat > stat {
547 stat = new_stat;
548 }
549 }
550
551 Ok(TestResult::new_ks(stat, n as f64))
552}
553
554///
555/// Preform two sample Kolmogorov-Smirnov test
556///
557/// Evaluates the Kolmogorov-Smirnov statistic:
558///
559/// $$ KS = \max |F_1(x) - F_2(x)| $$
560///
561/// Where both $F_1$ and $F_2$ are empirical cumulative distribution functions.
562/// The p-value is calculated from the statistic as prescribed in (Press 2007)
563///
564/// # References
565/// - Press W. H. , Teukolsky S. A., Vetterling W. T., Flannery B. P. (2007).
566/// Numerical Recipes 3rd Edition: The Art of Scientific Computing (3rd. ed.).
567/// Cambridge University Press, USA.
568///
569pub fn ks2_test<T>(sample1: Vec<T>, sample2: Vec<T>) -> Result<TestResult, TestError>
570where
571 T: PartialOrd + Copy,
572{
573 let n1 = sample1.len() as f64;
574 let n2 = sample2.len() as f64;
575
576 let ecdf1 = Ecdf::new(sample1)?;
577 let ecdf2 = Ecdf::new(sample2)?;
578
579 // Compute statistic
580 let stat = EcdfBridge::new(&ecdf1, &ecdf2)
581 .map(|v| v.abs())
582 .max_by(|a, b| a.partial_cmp(b).expect("PartialOrd comparison failed"))
583 .expect("Empty iterator?!");
584
585 Ok(TestResult::new_ks(stat, n1 * n2 / (n1 + n2)))
586}
587
588///
589/// Preform 2-sample Kuiper's test
590///
591/// Kuiper's test is the modification of the Kolmogorov-Smirnov test. Instead
592/// of searching for maximum deviation between empirical cdfs, it uses the
593/// following as the test statistic:
594/// $$ K = \max{[F_1(x) - F_2(x)]} + \min{[F_1(x) - F_2(x)]} $$
595///
596/// Where $F_i(x)$ is of course empirical cdf for the i-th sample.
597///
598/// The particularly interesting feature of the Kuiper's test is, that if the
599/// support is a circle, the test statistic is invariant under rotation.
600/// Thus it is used particularly often for angular distributions.
601/// It is also said to be more sensitive 'at the tails' of the distribution than
602/// KS test.
603///
604/// # References
605/// - Kuiper, N.H. (1960). Tests concerning random points on a circle.
606/// Indagationes Mathematicae (Proceedings), 63, 38-47.
607/// <https://doi.org/10.1016/S1385-7258(60)50006-0>
608///
609///
610pub fn kuiper2_test<T>(sample1: Vec<T>, sample2: Vec<T>) -> Result<TestResult, TestError>
611where
612 T: PartialOrd + Copy,
613{
614 let n1 = sample1.len() as f64;
615 let n2 = sample2.len() as f64;
616
617 let ecdf1 = Ecdf::new(sample1)?;
618 let ecdf2 = Ecdf::new(sample2)?;
619
620 let mut minimum = 0.0;
621 let mut maximum = 0.0;
622
623 for v in EcdfBridge::new(&ecdf1, &ecdf2) {
624 if v < minimum {
625 minimum = v
626 } else if v > maximum {
627 maximum = v
628 }
629 }
630 let stat = minimum.abs() + maximum.abs();
631
632 Ok(TestResult::new_kuiper(stat, n1 * n2 / (n1 + n2)))
633}
634
635///
636/// Perform Anderson-Darling two sample test
637///
638/// Should only be used with continuous distributions and large samples!
639/// (More than few 100s).
640///
641/// Implementation is based on the (Pettitt 1976) paper. As the result can be
642/// used only for continuous distributions where the duplicate values in the
643/// samples have vanishingly small probability to occur.
644///
645/// Note that the (Pettitt 1976) seems to have slightly different behaviour
646/// from the $ A^2\_{kN} $ statistic of (Scholz 1987) if the duplicate samples
647/// are present.
648///
649/// The p-value of the test is obtained using the simplified expression of
650/// (Marsaglia 2004) [see the C code distributed with the paper]. The p-value
651/// calculation uses large-sample approximation and may overestimate the
652/// distribution for smaller sample sizes.
653///
654/// # References
655/// - Pettitt, A. N. (1976). A Two-Sample Anderson--Darling Rank Statistic. Biometrika, 63(1),
656/// 161–168. <https://doi.org/10.2307/2335097>
657/// - Scholz, F. W., & Stephens, M. A. (1987). K-Sample Anderson-Darling Tests.
658/// Journal of the American Statistical Association, 82(399), 918–924.
659/// <https://doi.org/10.2307/2288805>
660/// - Marsaglia, G., & Marsaglia, J. (2004). Evaluating the Anderson-Darling
661/// Distribution. Journal of Statistical Software, 9(2), 1–5.
662/// <https://doi.org/10.18637/jss.v009.i02>
663///
664pub fn ad2_test<T>(sample1: Vec<T>, sample2: Vec<T>) -> Result<TestResult, TestError>
665where
666 T: PartialOrd + Copy,
667{
668 let n1 = sample1.len() as f64;
669 let n2 = sample2.len() as f64;
670 let tot_n = sample1.len() + sample2.len();
671
672 let ecdf1 = Ecdf::new(sample1)?;
673 let ecdf2 = Ecdf::new(sample2)?;
674
675 let mut stat = 0.0;
676
677 for (idx, v) in EcdfBridge::new(&ecdf1, &ecdf2).enumerate_samples() {
678 let denum = (idx + 1) * (tot_n - idx - 1);
679
680 // We need to exclude last entry which will be a NaN [0./0.]
681 if denum != 0 {
682 stat += v.powi(2) / denum as f64
683 }
684 }
685
686 // Scale by the population
687 stat *= n1 * n2;
688
689 Ok(TestResult::new_ad(stat, tot_n as f64))
690}
691
692#[cfg(test)]
693mod tests {
694 use super::*;
695
696 #[test]
697 fn test_ad_cdf() {
698 let val = vec![0.0, 0.3, 2.0, 3.0];
699 let ref_vals = vec![0.0, 0.06183989, 0.90816425, 0.97263617];
700
701 let rhs = val
702 .iter()
703 .map(|x| anderson_darling_cdf(*x))
704 .collect::<Vec<f64>>();
705
706 for (l, r) in std::iter::zip(ref_vals, rhs) {
707 approx::assert_relative_eq!(l, r, max_relative = 1.0e-6);
708 }
709 }
710
711 #[test]
712 fn test_ecdf() {
713 let samples = [0.3, 0.1, 0.5, 0.7];
714 let ecdf = Ecdf::new(samples.into()).unwrap();
715
716 assert_eq!(0.0, ecdf.get(-2.0));
717
718 assert_eq!(0.25, ecdf.get(0.1));
719 assert_eq!(0.25, ecdf.get(0.2));
720
721 assert_eq!(0.75, ecdf.get(0.5));
722 assert_eq!(0.75, ecdf.get(0.6));
723
724 assert_eq!(1.0, ecdf.get(0.7));
725 assert_eq!(1.0, ecdf.get(0.71));
726 }
727
728 #[test]
729 fn test_ks1_test() {
730 let samples = [0.3, 0.2, 0.25, 0.1, 0.9, 0.6];
731 let lhs = TestResult::new_ks(4.0 / 6.0 - 0.3, 6.0);
732 let rhs = ks1_test(|x| *x, samples.into()).unwrap();
733 assert_eq!(lhs, rhs);
734 }
735
736 #[test]
737 fn test_ks1_error() {
738 let samples = [0.3, 0.2, 0.25, 0.1, 0.9, 0.6, f64::NAN];
739 assert!(
740 ks1_test(|x| *x, samples.into()).is_err(),
741 "Failed to detect nan in the list"
742 )
743 }
744
745 #[test]
746 fn test_ks2_test() {
747 let samples1 = [0.3, 0.2, 0.25, 0.1, 0.9, 0.6];
748 let samples2 = [0.1, 0.8, 0.34, 0.09, 0.12, 0.81];
749
750 let rhs = ks2_test(samples1.into(), samples2.into()).unwrap();
751 let lhs = TestResult::new_ks(1.0 / 3.0, 3.0);
752 assert_eq!(lhs, rhs);
753 }
754
755 #[test]
756 fn test_ks2_test_repeated_samples() {
757 let samples1 = [1, 2, 2, 3];
758 let samples2 = [2];
759
760 let rhs = ks2_test(samples1.into(), samples2.into()).unwrap();
761 let lhs = TestResult::new_ks(0.25, 0.8);
762 assert_eq!(lhs, rhs);
763 }
764
765 #[test]
766 fn test_kuiper() {
767 let sample1 = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
768 let sample2 = vec![11, 2, 3, 4, 5, 6, 7, 8, 9, 10];
769
770 let rhs = kuiper2_test(sample1, sample2).unwrap();
771 let lhs = TestResult::new_kuiper(0.1, 5.0);
772
773 println!("{rhs:#?}");
774 assert_eq!(lhs, rhs);
775 }
776
777 #[test]
778 fn test_kuiper_same_sample() {
779 let sample1 = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
780
781 let rhs = kuiper2_test(sample1.clone(), sample1.clone()).unwrap();
782 let lhs = TestResult::new_kuiper(0.0, 5.0);
783
784 println!("{rhs:#?}");
785 assert_eq!(lhs, rhs);
786 }
787
788 #[test]
789 fn test_kuiper_2() {
790 let sample1 = vec![0.07, 0.74, 0.20, 0.55, 0.33, 0.98, 0.32, 0.36, 0.86, 0.43];
791 let sample2 = vec![0.73, 0.10, 0.49, 0.18, 0.87, 0.25, 0.80, 0.54, 0.90, 0.06];
792
793 let rhs = kuiper2_test(sample1, sample2).unwrap();
794 let lhs = TestResult::new_kuiper(0.4, 5.0);
795
796 println!("{rhs:#?}");
797 assert_eq!(lhs, rhs);
798 }
799
800 #[test]
801 fn test_ad() {
802 let sample1 = vec![0.07, 0.74, 0.20, 0.55, 0.33, 0.98, 0.32, 0.36, 0.86, 0.43];
803 let sample2 = vec![0.73, 0.10, 0.49, 0.18, 0.87, 0.25, 0.80, 0.54, 0.90, 0.06];
804
805 let rhs = ad2_test(sample1, sample2).unwrap();
806
807 // Reference value taken from SciPy
808 // (Internal `_anderson_ksamp_right` was used to get statistic before normalisation)
809 let lhs = TestResult::new_ad(0.3634446006350032, 20.0);
810
811 println!("{rhs:#?}");
812 assert_eq!(lhs, rhs);
813 }
814
815 #[test]
816 fn test_ecdf_bridge() {
817 let cases = [
818 (vec![1, 2, 2, 3], vec![2], vec![0.25, -0.25, 0.0]),
819 (vec![2], vec![1, 2, 2, 3], vec![-0.25, 0.25, 0.0]),
820 (vec![1, 2, 2, 3], vec![1, 2, 2, 3], vec![0.0, 0.0, 0.0]),
821 (vec![1, 2, 3, 4], vec![3, 3], vec![0.25, 0.5, -0.25, 0.0]),
822 (vec![2], vec![1, 2, 2, 2], vec![-0.25, 0.0]),
823 ];
824
825 for (s1, s2, reference) in cases {
826 let ecdf1 = Ecdf::new(s1).unwrap();
827 let ecdf2 = Ecdf::new(s2).unwrap();
828
829 let bridge = EcdfBridge::new(&ecdf1, &ecdf2).collect::<Vec<_>>();
830
831 assert_eq!(bridge, reference);
832 }
833 }
834
835 #[test]
836 fn test_ecdf_enumerated_bridge() {
837 let sample1 = vec![1, 2, 2, 3, 3];
838 let sample2 = vec![0, 2, 4];
839
840 let ecdf1 = Ecdf::new(sample1).unwrap();
841 let ecdf2 = Ecdf::new(sample2).unwrap();
842
843 let ref_count: Vec<usize> = vec![0, 1, 4, 6, 7];
844
845 let count = EcdfBridge::new(&ecdf1, &ecdf2)
846 .enumerate_samples()
847 .map(|(i, _)| i)
848 .collect::<Vec<usize>>();
849
850 assert_eq!(ref_count, count);
851 }
852
853 #[test]
854 fn test_ks_cdf_complement() {
855 // Approximate reference points obtained from SciPy
856 let test_points = [
857 (0.0, 1.0),
858 (1.0, 2.6999967168e-01),
859 (2.0, 6.7092525578e-04),
860 (3.0, 3.045996e-08),
861 (100.0, 0.0),
862 ];
863
864 for (x, val) in test_points {
865 approx::assert_relative_eq!(
866 val,
867 TestResult::complement_ks_cdf(x),
868 max_relative = 1.0e-7
869 );
870 }
871 }
872
873 #[test]
874 #[should_panic]
875 fn test_ks_cdf_complement_invalid_range() {
876 TestResult::complement_ks_cdf(-2.0);
877 }
878}