use crate::dist::KsTwoAsymptotic;
use crate::traits::Cdf;
use num::integer::binomial;
use num::Integer;
pub fn ks_test<X, F>(xs: &[X], cdf: F) -> (f64, f64)
where
X: Copy + PartialOrd,
F: Fn(X) -> f64,
{
let mut xs_r: Vec<X> = xs.to_vec();
xs_r.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
let n: f64 = xs_r.len() as f64;
let d = xs_r.iter().enumerate().fold(0.0, |acc, (i, &x)| {
let diff = ((i as f64) / n - cdf(x)).abs();
if diff > acc {
diff
} else {
acc
}
});
let p = 1.0 - ks_cdf(xs.len(), d);
(d, p)
}
const KS_AUTO_CUTOVER: usize = 10_000;
pub enum KsMode {
Exact,
Asymptotic,
Auto,
}
impl Default for KsMode {
fn default() -> KsMode {
KsMode::Auto
}
}
pub enum KsAlternative {
TwoSided,
Less,
Greater,
}
impl Default for KsAlternative {
fn default() -> KsAlternative {
KsAlternative::TwoSided
}
}
#[derive(Debug)]
pub enum KsError {
EmptySlice,
TooLongForExact,
}
#[allow(clippy::clippy::many_single_char_names)]
pub fn ks_two_sample<X>(
xs: &[X],
ys: &[X],
mode: KsMode,
alternative: KsAlternative,
) -> Result<(f64, f64), KsError>
where
X: Copy + PartialOrd,
{
if xs.is_empty() || ys.is_empty() {
return Err(KsError::EmptySlice);
}
let n_x = xs.len();
let n_y = ys.len();
let n_x_f = xs.len() as f64;
let n_y_f = ys.len() as f64;
let mut xs = xs.to_vec();
xs.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
let mut ys = ys.to_vec();
ys.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
let mut cdf_x = Vec::new();
let mut cdf_y = Vec::new();
for x in [&xs[..], &ys[..]].concat() {
match (&xs).binary_search_by(|b| b.partial_cmp(&x).unwrap()) {
Ok(z) => cdf_x.push((z as f64) / n_x_f),
Err(z) => cdf_x.push((z as f64) / n_x_f),
}
match (&ys).binary_search_by(|b| b.partial_cmp(&x).unwrap()) {
Ok(z) => cdf_y.push((z as f64) / n_y_f),
Err(z) => cdf_y.push((z as f64) / n_y_f),
}
}
let (min_s, max_s) = cdf_x
.iter()
.zip(cdf_y.iter())
.map(|(cx, cy)| (cx - cy))
.fold((std::f64::MAX, std::f64::MIN), |(min, max), z| {
let new_min = min.min(z);
let new_max = max.max(z);
(new_min, new_max)
});
let min_s = -min_s;
let stat = match alternative {
KsAlternative::Less => min_s,
KsAlternative::Greater => max_s,
KsAlternative::TwoSided => max_s.max(min_s),
};
let g = n_x.gcd(&n_y);
let g_f = g as f64;
let n_x_g = n_x_f / (g as f64);
let n_y_g = n_y_f / (g as f64);
let use_method = match mode {
KsMode::Asymptotic => KsMode::Asymptotic,
KsMode::Auto => {
if n_x.max(n_y) <= KS_AUTO_CUTOVER {
KsMode::Exact
} else {
KsMode::Asymptotic
}
}
KsMode::Exact => {
if n_x_g > std::f64::MAX / n_y_g {
return Err(KsError::TooLongForExact);
}
KsMode::Exact
}
};
match use_method {
KsMode::Exact => {
let lcm = (n_x_f / g_f) * n_y_f;
let h = (stat * lcm).round();
let stat = h / lcm;
if h == 0.0 {
Ok((stat, 1.0))
} else {
match alternative {
KsAlternative::TwoSided => {
if n_x == n_y {
Ok((stat, paths_outside_proportion(n_x, h)))
} else {
Ok((
stat,
1.0 - paths_inside_proportion(n_x, n_y, g, h),
))
}
}
_ => {
if n_x == n_y {
let p = (0..(h as usize)).fold(1.0, |p, j| {
((n_x - j) as f64) * p
/ (n_x_f + (j as f64) + 1.0)
});
Ok((stat, p))
} else {
let paths = paths_outside(n_x, n_y, g, h);
let bin = binomial(n_x + n_y, n_x);
Ok((stat, (paths as f64) / (bin as f64)))
}
}
}
}
}
KsMode::Asymptotic => {
let ks_dist = KsTwoAsymptotic::new();
match alternative {
KsAlternative::TwoSided => {
let en = (n_x_f * n_y_f / (n_y_f + n_x_f)).sqrt();
Ok((stat, 1.0 - ks_dist.cdf(&(en * stat))))
}
_ => {
let m = n_x.max(n_y) as f64;
let n = n_x.min(n_y) as f64;
let z = (m * n / (m + n)).sqrt() * stat;
let expt = -2.0 * z.powi(2)
- 2.0 * z * (m + 2.0 * n)
/ (m * n * (m + n)).sqrt()
/ 3.0;
let p = expt.exp();
Ok((stat, p))
}
}
}
KsMode::Auto => unreachable!(),
}
}
#[allow(clippy::clippy::many_single_char_names)]
fn paths_outside(m: usize, n: usize, g: usize, h: f64) -> usize {
let (m, n) = (m.max(n), m.min(n));
let mg = m / g;
let ng = n / g;
let ng_f = ng as f64;
let mg_f = mg as f64;
let xj: Vec<usize> = (0..=n)
.map(|j| ((h + mg_f * (j as f64)) / ng_f).ceil() as usize)
.filter(|&x| x <= m)
.collect();
let lxj = xj.len();
if lxj == 0 {
binomial(m + n, n)
} else {
let mut b: Vec<usize> = (0..lxj).map(|_| 0).collect();
b[0] = 1;
for j in 1..lxj {
let mut bj = binomial(xj[j] + j, j);
for i in 0..j {
let bin = binomial(xj[j] - xj[i] + j - i, j - i);
let dec = bin * b[i];
bj -= dec;
}
b[j] = bj;
}
let mut num_paths = 0;
for j in 0..lxj {
let bin = binomial((m - xj[j]) + (n - j), n - j);
let term = b[j] * bin;
num_paths += term
}
num_paths
}
}
fn paths_outside_proportion(n: usize, h: f64) -> f64 {
let mut p = 0.0;
let n_f = n as f64;
let k_max = (n_f / h) as usize;
for k in (0..=k_max).rev() {
let mut p1 = 1.0;
for j in 0..(h as usize) {
let j_f = j as f64;
let k_f = k as f64;
p1 = (n_f - k_f * h - j_f) * p1 / (n_f + k_f * h + j_f + 1.0);
}
p = p1 * (1.0 - p);
}
2.0 * p
}
#[allow(clippy::clippy::many_single_char_names)]
fn paths_inside_proportion(m: usize, n: usize, g: usize, h: f64) -> f64 {
let (m, n) = (m.max(n), m.min(n));
let n_f = n as f64;
let mg = m / g;
let ng = n / g;
let mg_f = mg as f64;
let ng_f = ng as f64;
let mut min_j = 0;
let mut max_j = (n + 1).min((h / (mg as f64)).ceil() as usize);
let mut cur_len = max_j - min_j;
let len_a = (n + 1).min(2 * max_j + 2);
let mut a: Vec<f64> = (0..len_a)
.map(|i| if i >= min_j && i < max_j { 1.0 } else { 0.0 })
.collect();
for i in 1..=m {
let i_f = i as f64;
let last_min_j = min_j;
let last_len = cur_len;
min_j = (((ng_f * i_f - h) / mg_f).floor() + 1.0).max(0.0) as usize;
min_j = min_j.min(n);
max_j = ((((ng_f * i_f + h) / mg_f).floor() + 1.0) as usize).max(n + 1);
if max_j <= min_j {
return 0.0;
}
for j in 0..(max_j - min_j) {
a[j] = a[min_j - last_min_j..max_j - last_min_j].iter().sum();
}
cur_len = max_j - min_j;
if last_len > cur_len {
for a_part in
a.iter_mut().skip(max_j - min_j).take(last_len - cur_len)
{
*a_part = 0.0;
}
}
let scaling_factor = i_f / (n_f + i_f);
a = a.iter().map(|x| x * scaling_factor).collect();
}
a[max_j - min_j - 1]
}
fn mmul(xs: &[Vec<f64>], ys: &[Vec<f64>]) -> Vec<Vec<f64>> {
let m = xs.len();
let mut zs = vec![vec![0.0; m]; m];
for i in 0..m {
for j in 0..m {
zs[i][j] = (0..m).fold(0.0, |acc, k| acc + xs[i][k] * ys[k][j])
}
}
zs
}
fn mpow(xs: &[Vec<f64>], ea: i32, n: usize) -> (Vec<Vec<f64>>, i32) {
let m = xs.len();
if n == 1 {
(xs.to_owned(), ea)
} else {
let (mut zs, mut ev) = mpow(xs, ea, n / 2);
let ys = mmul(&zs, &zs);
let eb = 2 * ev;
if n % 2 == 0 {
zs = ys;
ev = eb;
} else {
zs = mmul(&xs, &ys);
ev = ea + eb;
}
if zs[m / 2][m / 2] > 1E140 {
zs.iter_mut().for_each(|zs_i| {
zs_i.iter_mut().for_each(|z| (*z) *= 1E-140);
});
ev += 140;
}
(zs, ev)
}
}
#[allow(clippy::needless_range_loop)]
#[allow(clippy::many_single_char_names)]
fn ks_cdf(n: usize, d: f64) -> f64 {
let nf = n as f64;
let s: f64 = d * d * nf;
if s > 7.24 || (s > 3.76 && n > 99) {
1.0 - 2.0 * (-(2.000_071 + 0.331 / nf.sqrt() + 1.409 / nf) * s).exp()
} else {
let k: usize = ((nf * d) as usize) + 1;
let m: usize = 2 * k - 1;
let h: f64 = (k as f64) - nf * d;
let mut hs = vec![vec![0.0; m]; m];
for i in 0..m {
for j in 0..m {
if ((i as i32) - (j as i32) + 1) >= 0 {
hs[i][j] = 1.0
}
}
}
for i in 0..m {
hs[i][0] -= h.powi((i as i32) + 1);
hs[m - 1][i] -= h.powi((m as i32) - (i as i32))
}
hs[m - 1][0] += if 2.0 * h - 1.0 > 0.0 {
(2.0 * h - 1.0).powi(m as i32)
} else {
0.0
};
for i in 0..m {
for j in 0..m {
if (i as i32) - (j as i32) + 1 > 0 {
for g in 1..=i - j + 1 {
hs[i][j] /= g as f64;
}
}
}
}
let (qs, mut eq) = mpow(&hs, 0, n);
let mut s = qs[k - 1][k - 1];
for i in 1..n {
s *= (i as f64) / nf;
if s < 1e-140 {
s *= 1e140;
eq -= 140;
}
}
s * 10.0_f64.powi(eq)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::dist::Gaussian;
use crate::traits::Cdf;
const TOL: f64 = 1E-12;
#[test]
fn ks_cdf_normal() {
assert::close(ks_cdf(10, 0.274), 0.6284796154565043, TOL);
}
#[test]
fn ks_cdf_large_n() {
assert::close(ks_cdf(1000, 0.074), 0.9999671735299037, TOL);
}
#[test]
fn ks_test_pval() {
let xs: Vec<f64> =
vec![0.42, 0.24, 0.86, 0.85, 0.82, 0.82, 0.25, 0.78, 0.13, 0.27];
let g = Gaussian::standard();
let cdf = |x: f64| g.cdf(&x);
let (ks, p) = ks_test(&xs, cdf);
assert::close(ks, 0.551_716_786_654_561_1, TOL);
assert::close(p, 0.0021804502526949765, TOL);
}
#[test]
fn ks_two_sample_exact() {
let xs = [
0.95692026,
1.1348812,
-0.76579239,
-0.58065653,
-0.05122393,
0.71598754,
1.39873528,
0.42790527,
1.84580764,
0.64228521,
];
let ys = [
0.6948678,
-0.3741825,
0.36657279,
1.15834174,
-0.32421706,
-0.38499295,
1.44976991,
0.2504608,
-0.53694774,
1.42221993,
];
let (stat, alpha) =
ks_two_sample(&xs, &ys, KsMode::Exact, KsAlternative::TwoSided)
.unwrap();
assert::close(stat, 0.3, 1E-8);
assert::close(alpha, 0.7869297884777761, 1E-8);
let (stat, alpha) =
ks_two_sample(&xs, &ys, KsMode::Exact, KsAlternative::Less)
.unwrap();
assert::close(stat, 0.3, 1E-8);
assert::close(alpha, 0.41958041958041953, 1E-8);
let (stat, alpha) =
ks_two_sample(&xs, &ys, KsMode::Exact, KsAlternative::Greater)
.unwrap();
assert::close(stat, 0.2, 1E-8);
assert::close(alpha, 0.6818181818181818, 1E-8);
}
#[test]
fn ks_two_sample_asymp() {
let xs = [
0.95692026,
1.1348812,
-0.76579239,
-0.58065653,
-0.05122393,
0.71598754,
1.39873528,
0.42790527,
1.84580764,
0.64228521,
];
let ys = [
0.6948678,
-0.3741825,
0.36657279,
1.15834174,
-0.32421706,
-0.38499295,
1.44976991,
0.2504608,
-0.53694774,
1.42221993,
];
let (stat, alpha) = ks_two_sample(
&xs,
&ys,
KsMode::Asymptotic,
KsAlternative::TwoSided,
)
.unwrap();
assert::close(stat, 0.3, 1E-8);
assert::close(alpha, 0.7590978384203948, 1E-8);
let (stat, alpha) =
ks_two_sample(&xs, &ys, KsMode::Asymptotic, KsAlternative::Less)
.unwrap();
assert::close(stat, 0.3, 1E-8);
assert::close(alpha, 0.30119421191220214, 1E-8);
let (stat, alpha) =
ks_two_sample(&xs, &ys, KsMode::Asymptotic, KsAlternative::Greater)
.unwrap();
assert::close(stat, 0.2, 1E-8);
assert::close(alpha, 0.5488116360940264, 1E-8);
}
}