limma-rust 0.1.0

//! Cumulative overlap analysis of two ranked ID lists.
//!
//! Pure-Rust port of limma's `cumOverlap.R` ([`cum_overlap`]): test whether two
//! ordered lists of IDs are more overlapped near their tops than chance, using
//! hypergeometric tail probabilities with a directed Bonferroni adjustment.

use crate::special::ln_gamma;
use std::collections::{HashMap, HashSet};
use std::hash::Hash;

/// Output of [`cum_overlap`]. `n_overlap`, `p_value`, and `adj_p_value` are
/// indexed by list position `1..=n_total`; `n_min` is the (1-based) cut point
/// with the smallest adjusted p-value and `id_overlap` are the IDs overlapping
/// within the top `n_min` of both lists. An empty intersection yields
/// `n_total = 0` with `n_min = 0`, `p_min = NaN`, and empty vectors.
#[derive(Debug, Clone)]
pub struct CumOverlap<T> {
    pub n_total: usize,
    pub n_min: usize,
    pub p_min: f64,
    pub n_overlap: Vec<usize>,
    pub id_overlap: Vec<T>,
    pub p_value: Vec<f64>,
    pub adj_p_value: Vec<f64>,
}

fn has_duplicate<T: Eq + Hash>(v: &[T]) -> bool {
    let mut seen = HashSet::new();
    v.iter().any(|x| !seen.insert(x))
}

fn lchoose(n: usize, k: usize) -> f64 {
    if k > n {
        return f64::NEG_INFINITY;
    }
    ln_gamma((n + 1) as f64) - ln_gamma((k + 1) as f64) - ln_gamma((n - k + 1) as f64)
}

/// Upper-tail hypergeometric probability `P(X >= a)` for `X` counting white
/// balls in `k` draws from an urn of `m` white and `n` black. Equivalent to R's
/// `phyper(a - 0.5, m, n, k, lower.tail = FALSE)`.
fn phyper_upper(a: usize, m: usize, n: usize, k: usize) -> f64 {
    let lo = k.saturating_sub(n);
    let hi = m.min(k);
    let a = a.max(lo);
    if a > hi {
        return 0.0;
    }
    if a <= lo {
        return 1.0;
    }
    let lck = lchoose(m + n, k);
    let mut s = 0.0;
    for x in a..=hi {
        s += (lchoose(m, x) + lchoose(n, k - x) - lck).exp();
    }
    s.min(1.0)
}

/// `cumOverlap(ol1, ol2)`. Both lists must be duplicate-free (panics otherwise).
/// The lists are first reduced to their common IDs, then for each cut depth the
/// top-`j` overlap is scored by a hypergeometric upper tail and Bonferroni-
/// adjusted by rank.
pub fn cum_overlap<T: Eq + Hash + Clone>(ol1: &[T], ol2: &[T]) -> CumOverlap<T> {
    assert!(!has_duplicate(ol1), "Duplicate IDs found in ol1");
    assert!(!has_duplicate(ol2), "Duplicate IDs found in ol2");

    // Reduce both lists to their shared IDs (ol1 by membership in ol2, then ol2
    // by membership in the reduced ol1), preserving each list's own order.
    let set2: HashSet<&T> = ol2.iter().collect();
    let a: Vec<T> = ol1.iter().filter(|x| set2.contains(x)).cloned().collect();
    let set_a: HashSet<&T> = a.iter().collect();
    let b: Vec<T> = ol2.iter().filter(|x| set_a.contains(x)).cloned().collect();

    let ngenes = a.len();
    if ngenes == 0 {
        return CumOverlap {
            n_total: 0,
            n_min: 0,
            p_min: f64::NAN,
            n_overlap: Vec::new(),
            id_overlap: Vec::new(),
            p_value: Vec::new(),
            adj_p_value: Vec::new(),
        };
    }

    // m[k] = 1-based position of a[k] within b (a permutation of 1..=ngenes).
    let pos_b: HashMap<&T, usize> = b.iter().enumerate().map(|(i, x)| (x, i + 1)).collect();
    let m: Vec<usize> = a.iter().map(|x| pos_b[x]).collect();

    let mut n_overlap = vec![0usize; ngenes];
    let mut p_value = vec![0.0_f64; ngenes];
    for j in 1..=ngenes {
        let nov = m[..j].iter().filter(|&&v| v <= j).count();
        n_overlap[j - 1] = nov;
        p_value[j - 1] = phyper_upper(nov, j, ngenes - j, j);
    }

    // Directed Bonferroni: scale by rank, take the first minimiser, then clamp.
    let mut adj = vec![0.0_f64; ngenes];
    let mut n_min = 1usize;
    let mut min_val = f64::INFINITY;
    for j in 1..=ngenes {
        let pb = p_value[j - 1] * j as f64;
        if pb < min_val {
            min_val = pb;
            n_min = j;
        }
        adj[j - 1] = pb.min(1.0);
    }
    let p_min = adj[n_min - 1];

    let id_overlap: Vec<T> = (0..n_min)
        .filter(|&k| m[k] <= n_min)
        .map(|k| a[k].clone())
        .collect();

    CumOverlap {
        n_total: ngenes,
        n_min,
        p_min,
        n_overlap,
        id_overlap,
        p_value,
        adj_p_value: adj,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn close(a: &[f64], b: &[f64], tol: f64) -> bool {
        a.len() == b.len()
            && a.iter()
                .zip(b)
                .all(|(&x, &y)| (x - y).abs() <= tol + tol * y.abs())
    }

    #[test]
    fn equal_sets_match_r() {
        // Reference: cumOverlap(c(a..h), c(a,c,b,e,d,g,f,h)) in limma 3.68.3.
        let ol1 = ["a", "b", "c", "d", "e", "f", "g", "h"];
        let ol2 = ["a", "c", "b", "e", "d", "g", "f", "h"];
        let o = cum_overlap(&ol1, &ol2);
        assert_eq!(o.n_total, 8);
        assert_eq!(o.n_min, 3);
        assert!((o.p_min - 0.0535714285714286).abs() < 1e-12);
        assert_eq!(o.n_overlap, vec![1, 1, 3, 3, 5, 5, 7, 8]);
        assert!(close(
            &o.p_value,
            &[
                0.125,
                0.464285714285714,
                0.0178571428571429,
                0.242857142857143,
                0.0178571428571429,
                0.464285714285714,
                0.125,
                1.0,
            ],
            1e-12
        ));
        assert!(close(
            &o.adj_p_value,
            &[
                0.125,
                0.928571428571428,
                0.0535714285714286,
                0.971428571428572,
                0.0892857142857143,
                1.0,
                0.875000000000001,
                1.0,
            ],
            1e-12
        ));
        assert_eq!(o.id_overlap, vec!["a", "b", "c"]);
    }

    #[test]
    fn reduces_to_common_ids() {
        // Reference: cumOverlap(c(a,b,x,c,d), c(a,c,b,d,y)) reduces to a,b,c,d.
        let ol1 = ["a", "b", "x", "c", "d"];
        let ol2 = ["a", "c", "b", "d", "y"];
        let o = cum_overlap(&ol1, &ol2);
        assert_eq!(o.n_total, 4);
        assert_eq!(o.n_min, 1);
        assert!((o.p_min - 0.25).abs() < 1e-12);
        assert_eq!(o.n_overlap, vec![1, 1, 3, 4]);
        assert!(close(
            &o.p_value,
            &[0.25, 0.833333333333333, 0.25, 1.0],
            1e-12
        ));
        assert!(close(
            &o.adj_p_value,
            &[0.25, 1.0, 0.749999999999999, 1.0],
            1e-12
        ));
        assert_eq!(o.id_overlap, vec!["a"]);
    }
}