lace_stats/
ks.rs

1//! Kolmogorov-Smirnov test
2
3/// Compute the one-sample Kolmogorov-Smirnov statistic between the samples,
4/// `xs`, and the target `CDF`.
5pub fn ks_test<F: Fn(f64) -> f64>(xs: &[f64], cdf: F) -> f64 {
6    let mut xs_r: Vec<f64> = xs.to_vec();
7    xs_r.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
8
9    let n: f64 = xs_r.len() as f64;
10    xs_r.iter().enumerate().fold(0.0, |acc, (i, &x)| {
11        let diff = ((i as f64 + 1.0) / n - cdf(x)).abs();
12        if diff > acc {
13            diff
14        } else {
15            acc
16        }
17    })
18}
19
20// Computes the empirical CDF of xs on the values in vals
21// xs and all_vals must be sorted
22fn empirical_cdf(xs: &[f64], vals: &[f64]) -> Vec<f64> {
23    let n: f64 = xs.len() as f64;
24    let mut cdf: Vec<f64> = Vec::with_capacity(vals.len());
25    let mut ix: usize = 0;
26    vals.iter().for_each(|y| {
27        while ix < xs.len() && *y > xs[ix] {
28            ix += 1;
29        }
30        let p = (ix as f64) / n;
31        cdf.push(p);
32    });
33    cdf
34}
35
36/// Two-sample KS test statistic
37pub fn ks2sample(mut xs: Vec<f64>, mut ys: Vec<f64>) -> f64 {
38    let mut all_vals = xs.clone();
39    all_vals.extend(ys.clone());
40
41    xs.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
42    ys.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
43    all_vals.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
44
45    let cdf_x = empirical_cdf(&xs, &all_vals);
46    let cdf_y = empirical_cdf(&ys, &all_vals);
47
48    cdf_x.iter().zip(cdf_y).fold(0.0, |acc, (px, py)| {
49        let diff = (px - py).abs();
50        if diff > acc {
51            diff
52        } else {
53            acc
54        }
55    })
56}
57
58#[cfg(test)]
59mod tests {
60    use super::*;
61    use approx::*;
62
63    const TOL: f64 = 1E-8;
64
65    #[test]
66    fn empirical_cdf_when_xs_and_vals_same_length() {
67        let vals: Vec<f64> = vec![0.0, 1.5, 2.1, 3.0];
68        let xs: Vec<f64> = vec![1.0, 1.0, 2.0, 2.5];
69
70        let cdf = empirical_cdf(&xs, &vals);
71
72        assert_relative_eq!(cdf[0], 0.0, epsilon = TOL);
73        assert_relative_eq!(cdf[1], 0.5, epsilon = TOL);
74        assert_relative_eq!(cdf[2], 0.75, epsilon = TOL);
75        assert_relative_eq!(cdf[3], 1.0, epsilon = TOL);
76    }
77
78    #[test]
79    fn empirical_cdf_when_xs_all_same_value() {
80        let vals: Vec<f64> = vec![0.0, 1.5, 2.1, 3.0];
81        let xs: Vec<f64> = vec![1.0, 1.0, 1.0, 1.0];
82
83        let cdf = empirical_cdf(&xs, &vals);
84
85        assert_relative_eq!(cdf[0], 0.0, epsilon = TOL);
86        assert_relative_eq!(cdf[1], 1.0, epsilon = TOL);
87        assert_relative_eq!(cdf[2], 1.0, epsilon = TOL);
88        assert_relative_eq!(cdf[3], 1.0, epsilon = TOL);
89    }
90
91    #[test]
92    fn ks2sample_stat_should_be_zero_when_samples_are_identical() {
93        let xs: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0];
94        let ys: Vec<f64> = vec![2.0, 1.0, 4.0, 3.0];
95
96        assert_relative_eq!(0.0, ks2sample(xs, ys), epsilon = TOL);
97    }
98
99    #[test]
100    fn ks2sample_stat_simple_value_test_1() {
101        let xs: Vec<f64> = vec![1.0, 1.0, 4.0, 4.0];
102        let ys: Vec<f64> = vec![1.0, 1.0, 1.0, 4.0];
103
104        assert_relative_eq!(0.25, ks2sample(xs, ys), epsilon = TOL);
105    }
106
107    #[test]
108    fn ks2sample_stat_simple_value_test_2() {
109        let xs: Vec<f64> =
110            vec![0.42, 0.24, 0.86, 0.85, 0.82, 0.82, 0.25, 0.78, 0.13, 0.27];
111        let ys: Vec<f64> =
112            vec![0.24, 0.27, 0.87, 0.29, 0.57, 0.44, 0.5, 0.00, 0.56, 0.03];
113
114        assert_relative_eq!(0.4, ks2sample(xs, ys), epsilon = TOL);
115    }
116}