limma-rust 0.1.0

//! `mrlm` (lmfit.R): robust gene-wise regression, the engine `lmFit(method =
//! "robust")` dispatches to. Each gene is fit by `MASS::rlm`'s iteratively
//! reweighted least squares (IRLS), reproducing MASS 7.3.65's `rlm.default`
//! defaults exactly: `init = "ls"`, `psi = psi.huber` (k = 1.345),
//! `scale.est = "MAD"`, `wt.method = "inv.var"`, `maxit = 20`, `acc = 1e-4`,
//! `test.vec = "resid"`.
//!
//! Per gene (limma drops non-finite observations first):
//!   1. prior weights are folded in as `x <- x*sqrt(w)`, `y <- y*sqrt(w)`
//!      (the inv.var convention) so the IRLS loop itself runs unweighted;
//!   2. an ordinary least-squares fit seeds `coef`/`resid`;
//!   3. each iteration sets `scale = median(|resid|)/0.6745`, Huber weights
//!      `w = min(1, 1.345*scale/|resid|)`, refits by weighted least squares,
//!      and stops once `sqrt(sum((resid_old-resid)^2)/sum(resid_old^2)) <= 1e-4`.
//!
//! Outputs mirror `mrlm`: `coefficients = coef`, `sigma = s` (the scale of the
//! converging iteration), `stdev.unscaled = sqrt(diag((Xᵀ W X)⁻¹))` with the
//! final psi×prior weights, `df.residual = nobs - rank`, and a single
//! `cov.coefficients = (Xᵀ X)⁻¹` from the unweighted full design.
//!
//! Not reproduced (limma always calls rlm with defaults, so these branches are
//! unreachable through `lmFit`): `wt.method = "case"`, MM-estimation, and any
//! non-default psi / scale.est; also per-gene rank deficiency, since `rlm`
//! itself stops on a singular design.

use anyhow::{bail, Result};
use ndarray::{Array1, Array2};

use crate::dups::{kron_rows, unwrapdups};
use crate::fit::{new_fit, MArrayLM};
use crate::linalg::{cholesky_upper, solve_upper, solve_upper_transpose, xtx_inv_from_r};

const HUBER_K: f64 = 1.345;
const MAD_DENOM: f64 = 0.6745;
const ACC: f64 = 1e-4;
const MAXIT: usize = 20;

/// NaN-skipping mean of an iterator of values; `NaN` when nothing is finite.
fn nanmean<'a, I: IntoIterator<Item = &'a f64>>(it: I) -> f64 {
    let mut sum = 0.0;
    let mut cnt = 0usize;
    for &v in it {
        if v.is_finite() {
            sum += v;
            cnt += 1;
        }
    }
    if cnt > 0 {
        sum / cnt as f64
    } else {
        f64::NAN
    }
}

/// `Amean` exactly as `lmFit` populates it: row means of the original
/// expression matrix, reduced through `unwrapdups` when `ndups > 1`.
fn mrlm_amean(exprs: &Array2<f64>, ndups: usize, spacing: usize) -> Array1<f64> {
    let nspots = exprs.nrows();
    let spotmean: Vec<f64> = (0..nspots).map(|i| nanmean(exprs.row(i))).collect();
    if ndups <= 1 {
        return Array1::from(spotmean);
    }
    let col = Array2::from_shape_vec((nspots, 1), spotmean).expect("column shape");
    let uw = unwrapdups(&col, ndups, spacing.max(1));
    (0..uw.nrows()).map(|g| nanmean(uw.row(g))).collect()
}

/// `Xᵀ diag(w) X` (`p x p`).
fn xtwx(x: &Array2<f64>, w: &Array1<f64>) -> Array2<f64> {
    let (n, p) = x.dim();
    let mut a = Array2::<f64>::zeros((p, p));
    for i in 0..n {
        let wi = w[i];
        for j in 0..p {
            let xij = x[[i, j]] * wi;
            for l in 0..p {
                a[[j, l]] += xij * x[[i, l]];
            }
        }
    }
    a
}

/// Weighted least squares via the normal equations (Cholesky). Returns the
/// coefficients and the *raw* residuals `y - X coef`, matching `lm.wfit`, whose
/// `$residuals` are unscaled by the weights.
fn wls(x: &Array2<f64>, y: &Array1<f64>, w: &Array1<f64>) -> (Array1<f64>, Array1<f64>) {
    let (n, p) = x.dim();
    let a = xtwx(x, w);
    let mut b = Array1::<f64>::zeros(p);
    for i in 0..n {
        let wy = w[i] * y[i];
        for j in 0..p {
            b[j] += x[[i, j]] * wy;
        }
    }
    let u = cholesky_upper(&a);
    let z = solve_upper_transpose(&u, &b);
    let coef = solve_upper(&u, &z);
    let resid = y - &x.dot(&coef);
    (coef, resid)
}

/// `irls.delta(old, new) = sqrt(sum((old-new)^2)/max(1e-20, sum(old^2)))`.
fn irls_delta(old: &Array1<f64>, new: &Array1<f64>) -> f64 {
    let num: f64 = old
        .iter()
        .zip(new.iter())
        .map(|(&o, &n)| (o - n) * (o - n))
        .sum();
    let den: f64 = old.iter().map(|&o| o * o).sum::<f64>().max(1e-20);
    (num / den).sqrt()
}

/// `median(|x|)` with R's even-length convention (mean of the two middle order
/// statistics). Inputs are finite, so the comparison is total.
fn median_abs(r: &Array1<f64>) -> f64 {
    let mut v: Vec<f64> = r.iter().map(|x| x.abs()).collect();
    v.sort_by(|a, b| a.partial_cmp(b).expect("finite residuals"));
    let n = v.len();
    if n % 2 == 1 {
        v[n / 2]
    } else {
        (v[n / 2 - 1] + v[n / 2]) * 0.5
    }
}

/// One gene's robust fit (`MASS::rlm.default` with limma's defaults). `x` is the
/// observed-row design (`nobs x p`, full rank), `y` the observed responses, and
/// `wprior` the per-observation prior weights (all `> 0`). Returns the
/// coefficients, `stdev.unscaled` row `sqrt(diag((Xᵀ W X)⁻¹))`, and the scale
/// `s` of the converging iteration.
fn rlm_fit(
    x: &Array2<f64>,
    y: &Array1<f64>,
    wprior: &Array1<f64>,
) -> (Array1<f64>, Array1<f64>, f64) {
    let (n, p) = x.dim();

    // Fold prior weights (inv.var): scale rows of x and entries of y by sqrt(w),
    // after which the IRLS proceeds as if unweighted.
    let mut xf = x.clone();
    let mut yf = y.clone();
    for i in 0..n {
        let fac = wprior[i].sqrt();
        for j in 0..p {
            xf[[i, j]] *= fac;
        }
        yf[i] *= fac;
    }

    // init = "ls": ordinary least squares seeds coef/resid.
    let ones = Array1::<f64>::ones(n);
    let (mut coef, mut resid) = wls(&xf, &yf, &ones);

    let mut scale = 0.0;
    let mut wpsi = ones.clone();
    for _ in 0..MAXIT {
        let testpv = resid.clone();
        scale = median_abs(&resid) / MAD_DENOM;
        if scale == 0.0 {
            break;
        }
        // psi.huber(resid/scale) = min(1, k/|resid/scale|); |resid| == 0 -> 1.
        for i in 0..n {
            let u = resid[i] / scale;
            wpsi[i] = (HUBER_K / u.abs()).min(1.0);
        }
        let (c, r) = wls(&xf, &yf, &wpsi);
        coef = c;
        resid = r;
        if irls_delta(&testpv, &resid) <= ACC {
            break;
        }
    }

    // stdev.unscaled from the final weighted design: sqrt(diag((Xᵀ W X)⁻¹)),
    // matching sqrt(diag(chol2inv(out$qr$qr))) where the QR carries psi×prior.
    let xtwx_final = xtwx(&xf, &wpsi);
    let u = cholesky_upper(&xtwx_final);
    let inv = xtx_inv_from_r(&u);
    let stdev: Array1<f64> = (0..p).map(|j| inv[[j, j]].sqrt()).collect();

    (coef, stdev, scale)
}

/// Fit gene-wise robust linear models. Port of limma's `mrlm`, the
/// `lmFit(method = "robust")` path. `weights`, when supplied, is a full
/// probe-weight matrix matching `exprs` (non-positive weights drop the
/// corresponding observation, exactly as limma masks them to `NA`).
#[allow(clippy::too_many_arguments)]
pub fn mrlm(
    exprs: &Array2<f64>,
    design: &Array2<f64>,
    ndups: usize,
    spacing: usize,
    weights: Option<&Array2<f64>>,
    gene_names: Vec<String>,
    coef_names: Vec<String>,
) -> Result<MArrayLM> {
    let narrays0 = exprs.ncols();
    if design.nrows() != narrays0 {
        bail!(
            "design rows ({}) must match number of arrays ({})",
            design.nrows(),
            narrays0
        );
    }
    let nbeta = design.ncols();

    // Amean reflects the original data, before weight-driven NA masking below.
    let amean = mrlm_amean(exprs, ndups, spacing);

    // Clean weights: w <= 0 -> NA, then mask M wherever the weight is not finite
    // (limma: weights[weights<=0] <- NA; M[!is.finite(weights)] <- NA).
    let mut m = exprs.clone();
    let mut wmat: Option<Array2<f64>> = None;
    if let Some(w) = weights {
        if w.dim() != exprs.dim() {
            bail!("weights dimensions must match exprs");
        }
        let mut wc = w.clone();
        for i in 0..wc.nrows() {
            for j in 0..wc.ncols() {
                if wc[[i, j]] <= 0.0 {
                    wc[[i, j]] = f64::NAN;
                }
                if !wc[[i, j]].is_finite() {
                    m[[i, j]] = f64::NAN;
                }
            }
        }
        wmat = Some(wc);
    }

    // Duplicate spots: fold ndups columns side by side and replicate the design.
    let (mwork, wwork, dwork) = if ndups > 1 {
        let sp = spacing.max(1);
        let mu = unwrapdups(&m, ndups, sp);
        let wu = wmat.as_ref().map(|w| unwrapdups(w, ndups, sp));
        let du = kron_rows(design, ndups);
        (mu, wu, du)
    } else {
        (m, wmat, design.clone())
    };

    let ngenes = mwork.nrows();
    let n = mwork.ncols();

    let mut coefficients = Array2::<f64>::from_elem((ngenes, nbeta), f64::NAN);
    let mut stdev_unscaled = Array2::<f64>::from_elem((ngenes, nbeta), f64::NAN);
    let mut sigma = Array1::<f64>::from_elem(ngenes, f64::NAN);
    let mut df_residual = Array1::<f64>::zeros(ngenes);

    for g in 0..ngenes {
        let yrow = mwork.row(g);
        let obs: Vec<usize> = (0..n).filter(|&k| yrow[k].is_finite()).collect();
        let nobs = obs.len();
        // rlm needs more observations than coefficients (limma: length(y) > nbeta).
        if nobs <= nbeta {
            continue;
        }
        let mut x = Array2::<f64>::zeros((nobs, nbeta));
        let mut y = Array1::<f64>::zeros(nobs);
        let mut wp = Array1::<f64>::ones(nobs);
        for (ri, &k) in obs.iter().enumerate() {
            for j in 0..nbeta {
                x[[ri, j]] = dwork[[k, j]];
            }
            y[ri] = yrow[k];
            if let Some(w) = &wwork {
                wp[ri] = w[[g, k]];
            }
        }
        let (coef, stdev, s) = rlm_fit(&x, &y, &wp);
        for j in 0..nbeta {
            coefficients[[g, j]] = coef[j];
            stdev_unscaled[[g, j]] = stdev[j];
        }
        let df = (nobs - nbeta) as f64;
        df_residual[g] = df;
        if df > 0.0 {
            sigma[g] = s;
        }
    }

    // cov.coefficients = (Xᵀ X)⁻¹ of the unweighted full (replicated) design.
    let xtx = xtwx(&dwork, &Array1::<f64>::ones(dwork.nrows()));
    let u = cholesky_upper(&xtx);
    let cov_coefficients = xtx_inv_from_r(&u);

    let gnames: Vec<String> = if gene_names.len() == ngenes {
        gene_names
    } else {
        (0..ngenes).map(|i| i.to_string()).collect()
    };
    let cnames: Vec<String> = if coef_names.len() == nbeta {
        coef_names
    } else {
        (0..nbeta).map(|j| format!("coef{j}")).collect()
    };

    let mut fit = new_fit(
        coefficients,
        stdev_unscaled,
        sigma,
        df_residual,
        cov_coefficients,
        &gnames,
        &cnames,
    );
    fit.amean = amean;
    fit.design = Some(design.clone());
    Ok(fit)
}

#[cfg(test)]
#[allow(clippy::excessive_precision)]
mod tests {
    use super::*;

    fn rclose(a: f64, b: f64) -> bool {
        (a - b).abs() <= 1e-6 * (1.0 + b.abs())
    }

    fn names(n: usize, p: usize) -> (Vec<String>, Vec<String>) {
        (
            (0..n).map(|i| format!("g{i}")).collect(),
            (0..p).map(|j| format!("c{j}")).collect(),
        )
    }

    const GROUP: [f64; 8] = [0., 0., 0., 0., 1., 1., 1., 1.];

    /// Rational fixture mirroring scratch/mrlm_ref.R: 6 genes x 8 arrays with two
    /// deliberate outliers so the robust fit departs from OLS, plus a prior-weight
    /// matrix. Built from integer modular arithmetic for bit-identical inputs.
    fn mrlm_fixture() -> (Array2<f64>, Array2<f64>) {
        let mut m = Array2::<f64>::zeros((6, 8));
        let mut pw = Array2::<f64>::zeros((6, 8));
        for g0 in 0..6i64 {
            let lvl = 5.0 + (g0 % 5) as f64 * 0.25;
            let eff = (((g0 * 3) % 7) - 3) as f64 * 0.3;
            for k0 in 0..8i64 {
                let noise = (((g0 * 13 + k0 * 7) % 11) - 5) as f64 * 0.04;
                let mut val = lvl + eff * GROUP[k0 as usize] + noise;
                if g0 == 1 && k0 == 2 {
                    val += 4.5;
                }
                if g0 == 4 && k0 == 5 {
                    val -= 3.0;
                }
                m[[g0 as usize, k0 as usize]] = val;
                pw[[g0 as usize, k0 as usize]] = 0.5 + ((g0 * 3 + k0 * 2) % 7) as f64 * 0.1;
            }
        }
        (m, pw)
    }

    fn mrlm_design() -> Array2<f64> {
        let mut d = Array2::<f64>::zeros((8, 2));
        for k0 in 0..8 {
            d[[k0, 0]] = 1.0;
            d[[k0, 1]] = GROUP[k0];
        }
        d
    }

    #[test]
    fn mrlm_no_weights_outliers_matches_r() {
        let (m, _) = mrlm_fixture();
        let (gn, cn) = names(6, 2);
        let fit = mrlm(&m, &mrlm_design(), 1, 1, None, gn, cn).unwrap();

        let coef = [
            [4.9999999999999991, -0.87999999999999978],
            [5.3146141629547916, -0.074614162954790883],
            [5.4399999999999995, 1.0300000000000002],
            [5.7700000000000014, -0.39000000000000001],
            [5.9900000000000002, 0.48355048621571056],
            [5.0700000000000003, -0.68999999999999984],
        ];
        let stdev = [
            [0.49999999999999989, 0.70710678118654746],
            [0.57065962792466218, 0.75871760948531697],
            [0.49999999999999989, 0.70710678118654746],
            [0.49999999999999989, 0.70710678118654746],
            [0.49999999999999994, 0.75793140221465416],
            [0.49999999999999989, 0.70710678118654746],
        ];
        let sig = [
            0.16308376575240946,
            0.23333219796492946,
            0.16308376575240935,
            0.16308376575240935,
            0.16308376575240946,
            0.1630837657524094,
        ];
        let amean = [4.56, 5.7925, 5.955, 5.575, 5.925, 4.725];
        for g in 0..6 {
            assert!(rclose(fit.coefficients[[g, 0]], coef[g][0]), "coef0 g{g}");
            assert!(rclose(fit.coefficients[[g, 1]], coef[g][1]), "coef1 g{g}");
            assert!(rclose(fit.stdev_unscaled[[g, 0]], stdev[g][0]), "sd0 g{g}");
            assert!(rclose(fit.stdev_unscaled[[g, 1]], stdev[g][1]), "sd1 g{g}");
            assert!(rclose(fit.sigma[g], sig[g]), "sigma g{g}");
            assert!(rclose(fit.amean[g], amean[g]), "amean g{g}");
            assert_eq!(fit.df_residual[g], 6.0);
        }
        assert!(rclose(fit.cov_coefficients[[0, 0]], 0.25));
        assert!(rclose(fit.cov_coefficients[[0, 1]], -0.25));
        assert!(rclose(fit.cov_coefficients[[1, 1]], 0.5));
    }

    #[test]
    fn mrlm_prior_weights_matches_r() {
        let (m, pw) = mrlm_fixture();
        let (gn, cn) = names(6, 2);
        let fit = mrlm(&m, &mrlm_design(), 1, 1, Some(&pw), gn, cn).unwrap();

        let coef = [
            [5.0325000000000006, -0.90215517241379395],
            [5.3064763689357903, -0.054123427759319734],
            [5.4485714285714275, 1.0126785714285726],
            [5.7790909090909093, -0.38509090909090937],
            [6.0108476244986209, 0.48188034709807304],
            [5.0386206896551746, -0.64952978056426403],
        ];
        let stdev = [
            [0.55901699437494745, 0.81075741514148147],
            [0.62713777647068802, 0.82910761529214172],
            [0.53452248382484879, 0.77344313670384701],
            [0.55048188256318031, 0.79772403521746571],
            [0.56867031498498866, 0.81384867745376954],
            [0.5872202195147036, 0.80489619780267296],
        ];
        let sig = [
            0.1752276081871578,
            0.19892640030923783,
            0.14950403235466014,
            0.15826499012964992,
            0.12966195875949166,
            0.15628762766357865,
        ];
        for g in 0..6 {
            assert!(rclose(fit.coefficients[[g, 0]], coef[g][0]), "coef0 g{g}");
            assert!(rclose(fit.coefficients[[g, 1]], coef[g][1]), "coef1 g{g}");
            assert!(rclose(fit.stdev_unscaled[[g, 0]], stdev[g][0]), "sd0 g{g}");
            assert!(rclose(fit.stdev_unscaled[[g, 1]], stdev[g][1]), "sd1 g{g}");
            assert!(rclose(fit.sigma[g], sig[g]), "sigma g{g}");
            assert_eq!(fit.df_residual[g], 6.0);
        }
        // cov.coefficients is the unweighted full design, identical to scenario 1.
        assert!(rclose(fit.cov_coefficients[[0, 0]], 0.25));
        assert!(rclose(fit.cov_coefficients[[1, 1]], 0.5));
    }

    #[test]
    fn mrlm_one_na_drops_observation_matches_r() {
        let (mut m, _) = mrlm_fixture();
        m[[2, 5]] = f64::NAN; // 0-indexed gene 2, array 5
        let (gn, cn) = names(6, 2);
        let fit = mrlm(&m, &mrlm_design(), 1, 1, None, gn, cn).unwrap();

        // Affected gene 2: one observation dropped (df 6 -> 5), refit.
        assert_eq!(fit.df_residual[2], 5.0);
        assert!(rclose(fit.coefficients[[2, 0]], 5.4399999999999986));
        assert!(rclose(fit.coefficients[[2, 1]], 1.0400000000000007));
        assert!(rclose(fit.stdev_unscaled[[2, 0]], 0.49999999999999989));
        assert!(rclose(fit.stdev_unscaled[[2, 1]], 0.76376261582597327));
        assert!(rclose(fit.sigma[2], 0.17790956263899199));
        assert!(rclose(fit.amean[2], 5.8857142857142861));

        // Untouched genes match scenario 1 exactly.
        assert_eq!(fit.df_residual[0], 6.0);
        assert!(rclose(fit.coefficients[[1, 0]], 5.3146141629547916));
        assert!(rclose(fit.coefficients[[1, 1]], -0.074614162954790883));
        assert!(rclose(fit.sigma[5], 0.1630837657524094));
    }
}