limma-rust 0.1.0

//! `neqc.R`: normexp background correction for Illumina BeadArray data.
//!
//! * [`normexp_fit_control`] estimates the normexp parameters `(mu, log sigma,
//!   log alpha)` per array from negative-control probes;
//! * [`normexp_fit_detection_p`] does the same from detection p-values when no
//!   negative controls are available;
//! * [`nec`] applies [`crate::normexp::normexp_signal`] with those parameters
//!   and adds the `offset`;
//! * [`neqc`] runs `nec`, quantile-normalizes across arrays, takes `log2`, and
//!   keeps the regular probes.
//!
//! Matrix path only (limma's `EListRaw` object handling is out of scope for a
//! pure-Rust port). Both `robust = FALSE` (mean and sample SD of the negative
//! controls) and `robust = TRUE` (a `MASS::huber` lognormal fit of the log
//! negative-control intensities) control fits are supported.

use anyhow::{bail, Result};
use ndarray::{Array2, ArrayView1, Axis};

use crate::norm::{normalize_between_arrays, NormalizeMethod};
use crate::normexp::normexp_signal;

/// Index of the first maximum (R's `which.max`).
fn which_max(v: ArrayView1<f64>) -> usize {
    let mut bi = 0usize;
    let mut bv = v[0];
    for i in 1..v.len() {
        if v[i] > bv {
            bv = v[i];
            bi = i;
        }
    }
    bi
}

/// Index of the first minimum (R's `which.min`).
fn which_min(v: ArrayView1<f64>) -> usize {
    let mut bi = 0usize;
    let mut bv = v[0];
    for i in 1..v.len() {
        if v[i] < bv {
            bv = v[i];
            bi = i;
        }
    }
    bi
}

/// Median of a slice, matching R's `median` (mean of the two central order
/// statistics for even length). Input must be `NaN`-free.
fn median(v: &[f64]) -> f64 {
    let mut s: Vec<f64> = v.to_vec();
    s.sort_by(|a, b| a.partial_cmp(b).expect("finite"));
    let n = s.len();
    if n % 2 == 1 {
        s[n / 2]
    } else {
        (s[n / 2 - 1] + s[n / 2]) * 0.5
    }
}

/// `mad(y)` with the default scale constant `1.4826` and median centring.
fn mad(v: &[f64]) -> f64 {
    let center = median(v);
    let dev: Vec<f64> = v.iter().map(|&x| (x - center).abs()).collect();
    1.4826 * median(&dev)
}

/// `MASS::huber(y, k, tol)`: Huber M-estimate of location with a fixed MAD
/// scale `s`. The loop breaks *before* committing the converging update, so the
/// returned `mu` is the location from the previous iteration — matching MASS
/// exactly. Errors if the MAD scale is zero.
fn huber(y: &[f64], k: f64, tol: f64) -> Result<(f64, f64)> {
    let n = y.len() as f64;
    let s = mad(y);
    if s == 0.0 {
        bail!("cannot estimate scale: MAD is zero for negative controls");
    }
    let mut mu = median(y);
    loop {
        let lo = mu - k * s;
        let hi = mu + k * s;
        let mu1 = y.iter().map(|&v| v.clamp(lo, hi)).sum::<f64>() / n;
        if (mu - mu1).abs() < tol * s {
            break;
        }
        mu = mu1;
    }
    Ok((mu, s))
}

/// `normexp.fit.control(x, status, negctrl, regular, robust)`: estimate the
/// normexp parameters per array from negative-control probes. For
/// `robust = false`, `mu`/`sigma` are the mean and sample SD of the negative
/// controls. For `robust = true`, they are the lognormal mean and SD implied by
/// a [`huber`] fit of the log negative-control intensities (`mu = exp(m+s^2/2)`,
/// `sigma = sqrt(omega(omega-1)) exp(m)` with `omega = exp(s^2)`). In both cases
/// `alpha = max(mean(regular) - mu, 10)`. Returns an `narrays x 3` matrix of
/// `(mu, log(sigma), log(alpha))`. `status` is compared case-insensitively.
pub fn normexp_fit_control(
    x: &Array2<f64>,
    status: &[String],
    negctrl: &str,
    regular: &str,
    robust: bool,
) -> Result<Array2<f64>> {
    let (nprobe, narray) = x.dim();
    if status.len() != nprobe {
        bail!(
            "status length ({}) must match number of probes ({})",
            status.len(),
            nprobe
        );
    }
    let reg: Vec<usize> = (0..nprobe)
        .filter(|&g| status[g].eq_ignore_ascii_case(regular))
        .collect();
    let neg: Vec<usize> = (0..nprobe)
        .filter(|&g| status[g].eq_ignore_ascii_case(negctrl))
        .collect();
    if reg.is_empty() {
        bail!("No regular probes found");
    }
    if neg.len() < 2 {
        bail!("Fewer than two negative control probes found");
    }
    let nneg = neg.len() as f64;
    let nreg = reg.len() as f64;

    let mut par = Array2::<f64>::zeros((narray, 3));
    for j in 0..narray {
        let (mu, sigma) = if robust {
            // Huber location/scale of the log negative controls -> lognormal.
            let logneg: Vec<f64> = neg.iter().map(|&g| x[[g, j]].ln()).collect();
            let (m, s) = huber(&logneg, 1.5, 1e-6)?;
            let mu = (m + s * s / 2.0).exp();
            let omega = (s * s).exp();
            let sigma = (omega * (omega - 1.0)).sqrt() * m.exp();
            (mu, sigma)
        } else {
            let mu = neg.iter().map(|&g| x[[g, j]]).sum::<f64>() / nneg;
            let ss = neg
                .iter()
                .map(|&g| {
                    let d = x[[g, j]] - mu;
                    d * d
                })
                .sum::<f64>();
            (mu, (ss / (nneg - 1.0)).sqrt())
        };
        let mean_reg = reg.iter().map(|&g| x[[g, j]]).sum::<f64>() / nreg;
        let alpha = (mean_reg - mu).max(10.0);
        par[[j, 0]] = mu;
        par[[j, 1]] = sigma.ln();
        par[[j, 2]] = alpha.ln();
    }
    Ok(par)
}

/// `normexp.fit.detection.p(x, detection.p)`: infer the negative-control mean
/// and variance from detection p-values. The p-value matrix is flipped to
/// `1 - p` when, on the first array, the most-expressed probe has a smaller
/// p-value than the least-expressed one (limma's "actually 1-pvalues" check).
/// Returns an `narrays x 3` matrix of `(mu, log(sigma), log(alpha))`.
pub fn normexp_fit_detection_p(x: &Array2<f64>, detection_p: &Array2<f64>) -> Result<Array2<f64>> {
    let (nprobe, narray) = x.dim();
    if detection_p.dim() != x.dim() {
        bail!("detection p-value matrix must match intensity dimensions");
    }

    let mut dp = detection_p.clone();
    let y0 = x.column(0);
    let p0 = detection_p.column(0);
    if p0[which_max(y0)] < p0[which_min(y0)] {
        dp.mapv_inplace(|v| 1.0 - v);
    }

    let mut par = Array2::<f64>::zeros((narray, 3));
    for j in 0..narray {
        // Sort observations by (p, y) ascending.
        let mut ord: Vec<usize> = (0..nprobe).collect();
        ord.sort_by(|&a, &b| {
            dp[[a, j]]
                .partial_cmp(&dp[[b, j]])
                .expect("finite p")
                .then(x[[a, j]].partial_cmp(&x[[b, j]]).expect("finite y"))
        });
        let ys: Vec<f64> = ord.iter().map(|&i| x[[i, j]]).collect();
        let ps: Vec<f64> = ord.iter().map(|&i| dp[[i, j]]).collect();

        // First occurrence of each distinct p, dropping the very first: the
        // boundaries where the sorted p changes.
        let jidx: Vec<usize> = (1..nprobe).filter(|&k| ps[k] != ps[k - 1]).collect();
        let ync: Vec<f64> = jidx.iter().map(|&k| (ys[k] + ys[k - 1]) * 0.5).collect();
        let d: Vec<f64> = jidx.iter().map(|&k| ps[k] - ps[k - 1]).collect();
        let dmin = d.iter().copied().fold(f64::INFINITY, f64::min);
        let freq: Vec<f64> = d.iter().map(|&v| v / dmin).collect();
        let sumw: f64 = freq.iter().sum();
        let nn: f64 = sumw; // n = sum(freq)

        let mu = ync
            .iter()
            .zip(freq.iter())
            .map(|(&yv, &w)| yv * w)
            .sum::<f64>()
            / sumw;
        let var_w = ync
            .iter()
            .zip(freq.iter())
            .map(|(&yv, &w)| {
                let dd = yv - mu;
                dd * dd * w
            })
            .sum::<f64>()
            / sumw;
        let sigma = (var_w * nn / (nn - 1.0)).sqrt();

        let mean_x = (0..nprobe).map(|g| x[[g, j]]).sum::<f64>() / nprobe as f64;
        let alpha = (mean_x - mu).max(10.0);
        par[[j, 0]] = mu;
        par[[j, 1]] = sigma.ln();
        par[[j, 2]] = alpha.ln();
    }
    Ok(par)
}

/// `nec(x, status, ...)`: normexp background correction aided by negative
/// controls. When `status` contains probes matching `negctrl`, parameters come
/// from [`normexp_fit_control`]; otherwise `detection_p` must be supplied and
/// [`normexp_fit_detection_p`] is used. Returns the corrected intensities plus
/// `offset` (limma's default `offset = 16`).
#[allow(clippy::too_many_arguments)]
pub fn nec(
    x: &Array2<f64>,
    status: Option<&[String]>,
    negctrl: &str,
    regular: &str,
    offset: f64,
    robust: bool,
    detection_p: Option<&Array2<f64>>,
) -> Result<Array2<f64>> {
    let has_neg = match status {
        Some(s) => s.iter().any(|t| t.eq_ignore_ascii_case(negctrl)),
        None => false,
    };
    let par = if has_neg {
        normexp_fit_control(x, status.expect("status present"), negctrl, regular, robust)?
    } else {
        match detection_p {
            Some(dp) => normexp_fit_detection_p(x, dp)?,
            None => bail!("no negative control probes found and no detection p-values supplied"),
        }
    };

    let (nprobe, narray) = x.dim();
    let mut out = Array2::<f64>::zeros((nprobe, narray));
    for j in 0..narray {
        let col: Vec<f64> = (0..nprobe).map(|g| x[[g, j]]).collect();
        let p = [par[[j, 0]], par[[j, 1]], par[[j, 2]]];
        let sig = normexp_signal(&p, &col);
        for g in 0..nprobe {
            out[[g, j]] = sig[g] + offset;
        }
    }
    Ok(out)
}

/// `neqc(x, status, ...)`: [`nec`] followed by quantile normalization across
/// arrays, `log2`, and restriction to the regular probes (when `status` is
/// given). Returns the `log2` normalized regular-probe matrix.
#[allow(clippy::too_many_arguments)]
pub fn neqc(
    x: &Array2<f64>,
    status: Option<&[String]>,
    negctrl: &str,
    regular: &str,
    offset: f64,
    robust: bool,
    detection_p: Option<&Array2<f64>>,
) -> Result<Array2<f64>> {
    let xbg = nec(x, status, negctrl, regular, offset, robust, detection_p)?;
    let qn = normalize_between_arrays(&xbg, NormalizeMethod::Quantile);
    let y = qn.mapv(|v| v.log2());
    match status {
        Some(s) => {
            let reg: Vec<usize> = (0..x.nrows())
                .filter(|&g| s[g].eq_ignore_ascii_case(regular))
                .collect();
            Ok(y.select(Axis(0), &reg))
        }
        None => Ok(y),
    }
}

#[cfg(test)]
#[allow(clippy::excessive_precision, clippy::approx_constant)]
mod tests {
    use super::*;
    use ndarray::Array2;

    fn rclose(a: f64, b: f64) -> bool {
        (a - b).abs() <= 1e-8 * (1.0 + b.abs())
    }

    /// Control-probe fixture: 12 probes x 4 arrays, first 4 negative controls.
    fn ctrl_fixture() -> (Array2<f64>, Vec<String>) {
        let mut x = Array2::<f64>::zeros((12, 4));
        for g0 in 0..12i64 {
            for j0 in 0..4i64 {
                let v = if g0 < 4 {
                    55 + g0 * 6 + j0 * 5 + ((g0 * 3 + j0 * 2) % 7)
                } else {
                    let r = g0 - 4;
                    180 + r * 55 + j0 * 12 + ((r * 5 + j0 * 3) % 9) * 7
                };
                x[[g0 as usize, j0 as usize]] = v as f64;
            }
        }
        let mut status: Vec<String> = vec!["negative".to_string(); 4];
        status.extend(vec!["regular".to_string(); 8]);
        (x, status)
    }

    #[test]
    fn normexp_fit_control_matches_r() {
        let (x, status) = ctrl_fixture();
        let par = normexp_fit_control(&x, &status, "negative", "regular", false).unwrap();
        let expect = [
            [66.75, 2.2168942846162354, 5.8103922097144594],
            [72.0, 2.1048277043665475, 5.8226760045966115],
            [77.25, 2.2168942846162354, 5.8575759478356177],
            [82.5, 1.8648507243170958, 5.8692969131337742],
        ];
        for j in 0..4 {
            for k in 0..3 {
                assert!(rclose(par[[j, k]], expect[j][k]), "par[{j},{k}]");
            }
        }
    }

    #[test]
    fn nec_control_matches_r() {
        let (x, status) = ctrl_fixture();
        let out = nec(&x, Some(&status), "negative", "regular", 16.0, false, None).unwrap();
        // Spot-check several entries from the R reference (12x4).
        assert!(rclose(out[[0, 0]], 20.305342016641642));
        assert!(rclose(out[[0, 3]], 19.357572919956016));
        assert!(rclose(out[[4, 0]], 128.99756554307118));
        assert!(rclose(out[[4, 2]], 184.50919971418364));
        assert!(rclose(out[[11, 0]], 569.99756554307112));
        assert!(rclose(out[[11, 3]], 590.38229755178907));
        assert!(rclose(out[[7, 1]], 300.8007152546553));
    }

    #[test]
    fn neqc_control_matches_r() {
        let (x, status) = ctrl_fixture();
        let y = neqc(&x, Some(&status), "negative", "regular", 16.0, false, None).unwrap();
        assert_eq!(y.dim(), (8, 4));
        let col0 = [
            7.2754023606686635,
            7.8402897759719394,
            8.087051426298979,
            8.3681678774159849,
            8.6033386841443473,
            8.8054938193940089,
            8.937145573861951,
            9.1406315482768701,
        ];
        for g in 0..8 {
            for j in 0..4 {
                assert!(rclose(y[[g, j]], col0[g]), "neqc[{g},{j}]");
            }
        }
    }

    #[test]
    fn huber_matches_mass() {
        let (x, _status) = ctrl_fixture();
        // log of column-0 negative controls (rows 0..4).
        let logneg: Vec<f64> = (0..4).map(|g| x[[g, 0]].ln()).collect();
        let (mu, s) = huber(&logneg, 1.5, 1e-6).unwrap();
        assert!(rclose(mu, 4.1968233644781909), "huber mu: {mu}");
        assert!(rclose(s, 0.11757390886994287), "huber s: {s}");
    }

    #[test]
    fn normexp_fit_control_robust_matches_r() {
        let (x, status) = ctrl_fixture();
        let par = normexp_fit_control(&x, &status, "negative", "regular", true).unwrap();
        let expect = [
            [66.935882025964389, 2.0665069321187759, 5.8098351046697276],
            [72.056377672121371, 2.0432833532410433, 5.8225091311190793],
            [77.313938618332429, 2.1404505355358392, 5.8573931841018823],
            [82.704902206490445, 2.0928195889897081, 5.868717925760583],
        ];
        for j in 0..4 {
            for k in 0..3 {
                assert!(rclose(par[[j, k]], expect[j][k]), "robust par[{j},{k}]");
            }
        }
    }

    #[test]
    fn nec_neqc_robust_matches_r() {
        let (x, status) = ctrl_fixture();
        let out = nec(&x, Some(&status), "negative", "regular", 16.0, true, None).unwrap();
        assert!(rclose(out[[0, 0]], 19.423283483241832));
        assert!(rclose(out[[0, 3]], 20.511023200579992));
        assert!(rclose(out[[4, 0]], 128.87715061732911));
        assert!(rclose(out[[4, 1]], 156.76738829317546));
        assert!(rclose(out[[11, 0]], 569.87715061732911));
        assert!(rclose(out[[11, 3]], 590.10929673355065));

        let y = neqc(&x, Some(&status), "negative", "regular", 16.0, true, None).unwrap();
        assert_eq!(y.dim(), (8, 4));
        let col = [
            7.2743390086104114,
            7.8395710271311723,
            8.0864456985233844,
            8.3676694099002535,
            8.6029152049207713,
            8.8051257166846142,
            8.9368095786763639,
            9.140339757863492,
        ];
        for g in 0..8 {
            for j in 0..4 {
                assert!(rclose(y[[g, j]], col[g]), "robust neqc[{g},{j}]");
            }
        }
    }

    /// Detection-p fixture: 8 probes x 3 arrays, all regular, distinct per-column
    /// p-values that decrease with expression (standard detection p-values).
    fn detp_fixture() -> (Array2<f64>, Array2<f64>) {
        let mut x = Array2::<f64>::zeros((8, 3));
        let mut dp = Array2::<f64>::zeros((8, 3));
        for g0 in 0..8i64 {
            for j0 in 0..3i64 {
                x[[g0 as usize, j0 as usize]] = (120 + g0 * 37 + j0 * 19) as f64;
                dp[[g0 as usize, j0 as usize]] =
                    (8 - g0) as f64 / 9.0 - j0 as f64 * 0.013 - g0 as f64 * 0.0007;
            }
        }
        (x, dp)
    }

    #[test]
    fn normexp_fit_detection_p_matches_r() {
        let (x, dp) = detp_fixture();
        let par = normexp_fit_detection_p(&x, &dp).unwrap();
        let expect = [
            [249.5, 4.3811404331177988, 2.3025850929940459],
            [268.5, 4.3811404331177988, 2.3025850929940459],
            [287.5, 4.3811404331177988, 2.3025850929940459],
        ];
        for j in 0..3 {
            for k in 0..3 {
                assert!(rclose(par[[j, k]], expect[j][k]), "par[{j},{k}]");
            }
        }
    }

    #[test]
    fn nec_detection_p_matches_r() {
        let (x, dp) = detp_fixture();
        let status = vec!["regular".to_string(); 8];
        let out = nec(
            &x,
            Some(&status),
            "negative",
            "regular",
            16.0,
            false,
            Some(&dp),
        )
        .unwrap();
        let col = [
            24.143687836126446,
            24.538051994415582,
            24.971626716787227,
            25.450412234664782,
            25.981663577672862,
            26.574226144777185,
            27.238981111286421,
            27.989443237658861,
        ];
        for g in 0..8 {
            for j in 0..3 {
                assert!(rclose(out[[g, j]], col[g]), "nec_dp[{g},{j}]");
            }
        }
    }
}