limma-rust 0.1.0

Pure-Rust port of the Bioconductor limma differential-expression package
Documentation
//! Proportion of true null hypotheses.
//!
//! Pure-Rust port of limma's `propTrueNull.R` and `convest.R`.
//! [`prop_true_null`] mirrors `propTrueNull(p, method)` with the three
//! closed-form estimators (`lfdr`, `mean`, `hist`) and the `convest` method;
//! [`convest`] is the standalone convex decreasing density estimate of Langaas,
//! Lindqvist & Ferkingstad (2005).

/// Method for [`prop_true_null`], mirroring `propTrueNull(p, method=...)`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PropTrueNullMethod {
    /// `"lfdr"` (limma's default): average local false discovery rate.
    Lfdr,
    /// `"mean"`: twice the mean of the capped p-values.
    Mean,
    /// `"hist"`: histogram tail-mean method (uses `nbins`).
    Hist,
    /// `"convest"`: convex decreasing density estimate ([`convest`]).
    Convest,
}

/// `propTrueNull(p, method, nbins)`: estimate the proportion of true null
/// hypotheses among the p-values `p`. `nbins` is only used by
/// [`PropTrueNullMethod::Hist`] (limma's default is 20); the `Convest` arm runs
/// [`convest`] with limma's defaults (`niter = 100`, `tol = 1e-6`).
pub fn prop_true_null(p: &[f64], method: PropTrueNullMethod, nbins: usize) -> f64 {
    match method {
        PropTrueNullMethod::Lfdr => by_local_fdr(p),
        PropTrueNullMethod::Mean => by_mean_p(p),
        PropTrueNullMethod::Hist => from_histogram(p, nbins),
        PropTrueNullMethod::Convest => convest(p, 100, 1e-6),
    }
}

/// `.propTrueNullByLocalFDR`: average local FDR over the sorted p-values.
fn by_local_fdr(p: &[f64]) -> f64 {
    let n = p.len();
    let mut ps = p.to_vec();
    ps.sort_by(|a, b| b.partial_cmp(a).unwrap()); // decreasing
    let nf = n as f64;
    let n1 = (n + 1) as f64;
    let mut s = 0.0;
    for (k, &pv) in ps.iter().enumerate() {
        let i = (n - k) as f64; // n, n-1, ..., 1
        let q = (nf / i * pv).min(1.0);
        s += i * q;
    }
    s / nf / n1 * 2.0
}

/// `.propTrueNullByMeanP`: twice the mean of `pmin(p_(i), (i-0.5)/n)`.
fn by_mean_p(p: &[f64]) -> f64 {
    let n = p.len();
    let mut ps = p.to_vec();
    ps.sort_by(|a, b| a.partial_cmp(b).unwrap()); // increasing
    let nf = n as f64;
    let mut s = 0.0;
    for (k, &pv) in ps.iter().enumerate() {
        let i = (k + 1) as f64;
        s += pv.min((i - 0.5) / nf);
    }
    2.0 * s / nf
}

/// `.propTrueNullFromHistogram`: tail-mean histogram method on `nbins` bins.
fn from_histogram(p: &[f64], nbins: usize) -> f64 {
    let nb = nbins as f64;
    // Bins are the right-closed intervals (-0.1, 1/nbins], ..., ((nbins-1)/nbins, 1].
    let mut counts = vec![0.0_f64; nbins];
    for &pv in p {
        for t in 1..=nbins {
            if pv <= t as f64 / nb {
                counts[t - 1] += 1.0;
                break;
            }
        }
    }
    // tail_means[k] = mean of counts[k..nbins] = rev(cumsum(rev(counts))/(1:nbins)).
    let mut tail_means = vec![0.0_f64; nbins];
    let mut cum = 0.0;
    for k in (0..nbins).rev() {
        cum += counts[k];
        tail_means[k] = cum / (nbins - k) as f64;
    }
    // First k where the tail mean is at least the bin's own count (always exists
    // at k = nbins-1, where they are equal).
    let mut idx = nbins - 1;
    for k in 0..nbins {
        if tail_means[k] >= counts[k] {
            idx = k;
            break;
        }
    }
    tail_means[idx] / tail_means[0]
}

/// `convest(p, niter, tol)`: convex decreasing density estimate of pi0.
///
/// Iteratively builds the least-concave-majorant density on the 0.01 grid and
/// returns its value at 1, the estimated proportion of true nulls. `p` must lie
/// in `[0, 1]` with no NaNs (limma's `convest` errors otherwise); an empty `p`
/// returns NaN, matching R's `NA`.
pub fn convest(p: &[f64], niter: usize, tol: f64) -> f64 {
    let m = p.len();
    if m == 0 {
        return f64::NAN;
    }
    let ny = tol;
    let mut pv: Vec<f64> = p.to_vec();
    pv.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let p = pv; // sorted ascending

    // p.c = ceiling(100*p)/100, p.f = floor(100*p)/100 (used in the interpolation).
    let p_c: Vec<f64> = p.iter().map(|&v| (100.0 * v).ceil() / 100.0).collect();

    // x.grid = (0:100)/100 (101 points); the t-grid for theta is (1:100)/100.
    let x_grid: Vec<f64> = (0..=100).map(|t| t as f64 / 100.0).collect();

    let mut f_hat = vec![1.0_f64; 101]; // density estimate at the x-grid
    let mut f_hat_p = vec![1.0_f64; m]; // density estimate at the p-values

    let mut theta_hat = argmax_theta(&p, None);
    let mut f_theta_hat = triangular(theta_hat, &x_grid);
    let mut f_theta_hat_p = triangular(theta_hat, &p);

    let mut pi0 = f_hat[100];

    for _ in 0..niter {
        let f_hat_diff: Vec<f64> = (0..m).map(|i| f_hat_p[i] - f_theta_hat_p[i]).collect();

        let s: f64 = (0..m).map(|i| f_hat_diff[i] / f_hat_p[i]).sum();
        let eps = if s > 0.0 {
            0.0
        } else {
            let mut l = 0.0_f64;
            let mut u = 1.0_f64;
            let mut e = 0.5_f64;
            while (u - l).abs() > ny {
                e = (l + u) / 2.0;
                let cond: f64 = (0..m)
                    .filter(|&i| f_hat_p[i] > 0.0)
                    .map(|i| f_hat_diff[i] / ((1.0 - e) * f_hat_p[i] + e * f_theta_hat_p[i]))
                    .sum();
                if cond < 0.0 {
                    l = e;
                } else {
                    u = e;
                }
            }
            e
        };

        for k in 0..101 {
            f_hat[k] = (1.0 - eps) * f_hat[k] + eps * f_theta_hat[k];
        }
        pi0 = f_hat[100];

        // f.hat.p: linear interpolation of f.hat between its floor/ceil grid points.
        for i in 0..m {
            let idx_f = (100.0 * p[i]).floor() as usize;
            let idx_c = (100.0 * p[i]).ceil() as usize;
            let ff = f_hat[idx_f];
            let fc = f_hat[idx_c];
            f_hat_p[i] = 100.0 * (ff - fc) * (p_c[i] - p[i]) + fc;
        }

        theta_hat = argmax_theta(&p, Some(&f_hat_p));
        f_theta_hat = triangular(theta_hat, &x_grid);
        f_theta_hat_p = triangular(theta_hat, &p);

        // If the Unif[0,1] density fits better, switch f.theta.hat to it.
        let s1: f64 = (0..m).map(|i| f_theta_hat_p[i] / f_hat_p[i]).sum();
        let s2: f64 = (0..m).map(|i| 1.0 / f_hat_p[i]).sum();
        if s1 < s2 {
            f_theta_hat = vec![1.0; 101];
            f_theta_hat_p = vec![1.0; m];
        }
    }
    pi0
}

/// `0.01 * which.max_theta sum_i w_i * 2(theta - p_i) 1[p_i < theta] / theta^2`,
/// with `w_i = 1` (`weights = None`) or `w_i = 1/f_hat_p_i`. `which.max` keeps
/// the first maximizing grid point, matching R.
fn argmax_theta(p: &[f64], weights: Option<&[f64]>) -> f64 {
    let mut best_val = f64::NEG_INFINITY;
    let mut best_t = 1usize;
    for t in 1..=100 {
        let theta = t as f64 / 100.0;
        let t2 = theta * theta;
        let mut s = 0.0;
        for (i, &pi) in p.iter().enumerate() {
            if pi < theta {
                let term = 2.0 * (theta - pi) / t2;
                s += match weights {
                    Some(w) => term / w[i],
                    None => term,
                };
            }
        }
        if s > best_val {
            best_val = s;
            best_t = t;
        }
    }
    best_t as f64 / 100.0
}

/// Triangular density `2(theta - x) 1[x < theta] / theta^2` on the points `xs`.
fn triangular(theta: f64, xs: &[f64]) -> Vec<f64> {
    let t2 = theta * theta;
    xs.iter()
        .map(|&x| {
            if x < theta {
                2.0 * (theta - x) / t2
            } else {
                0.0
            }
        })
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    // A fixed p-value vector with signal at the low end so pi0 < 1 and the
    // methods give distinct estimates. Mirrors scratch/proptruenull_ref.R.
    const P: [f64; 30] = [
        0.0001, 0.0007, 0.0023, 0.0041, 0.0089, 0.012, 0.018, 0.027, 0.035, 0.052, 0.071, 0.098,
        0.13, 0.17, 0.21, 0.28, 0.33, 0.41, 0.47, 0.55, 0.58, 0.63, 0.69, 0.74, 0.78, 0.83, 0.88,
        0.91, 0.95, 0.99,
    ];

    fn close(a: f64, b: f64, tol: f64) -> bool {
        (a - b).abs() <= tol + tol * b.abs()
    }

    #[test]
    fn closed_form_methods_match_r() {
        assert!(
            close(
                prop_true_null(&P, PropTrueNullMethod::Lfdr, 20),
                0.700587096774194,
                1e-12
            ),
            "lfdr: {}",
            prop_true_null(&P, PropTrueNullMethod::Lfdr, 20)
        );
        assert!(
            close(
                prop_true_null(&P, PropTrueNullMethod::Mean, 20),
                0.723495555555556,
                1e-12
            ),
            "mean: {}",
            prop_true_null(&P, PropTrueNullMethod::Mean, 20)
        );
        assert!(
            close(
                prop_true_null(&P, PropTrueNullMethod::Hist, 20),
                0.666666666666667,
                1e-12
            ),
            "hist20: {}",
            prop_true_null(&P, PropTrueNullMethod::Hist, 20)
        );
        assert!(
            close(
                prop_true_null(&P, PropTrueNullMethod::Hist, 10),
                0.666666666666667,
                1e-12
            ),
            "hist10: {}",
            prop_true_null(&P, PropTrueNullMethod::Hist, 10)
        );
    }

    #[test]
    fn convest_matches_r() {
        assert!(
            close(convest(&P, 100, 1e-6), 0.654984323974808, 1e-7),
            "convest100: {}",
            convest(&P, 100, 1e-6)
        );
        assert!(
            close(convest(&P, 50, 1e-6), 0.654639572943371, 1e-7),
            "convest50: {}",
            convest(&P, 50, 1e-6)
        );
        assert!(
            close(
                prop_true_null(&P, PropTrueNullMethod::Convest, 20),
                0.654984323974808,
                1e-7
            ),
            "convest via dispatch: {}",
            prop_true_null(&P, PropTrueNullMethod::Convest, 20)
        );
    }

    #[test]
    fn convest_empty_is_nan() {
        assert!(convest(&[], 100, 1e-6).is_nan());
    }
}