limma-rust 0.1.0

//! Gene-wise linear model fitting. Port of limma's `lmFit`
//! (`lm.series` / `nonEstimable`), least-squares path only.

use anyhow::{bail, Result};
use ndarray::{s, Array1, Array2, ArrayView1, ArrayView2, Axis};

use crate::linalg::{eigh, inv_upper, matrix_rank, qr_econ, xtx_inv_from_r};

/// Result of fitting gene-wise linear models. Port of `MArrayLM`.
#[derive(Clone, Debug)]
pub struct MArrayLM {
    pub coefficients: Array2<f64>,     // n_genes x n_coef
    pub stdev_unscaled: Array2<f64>,   // n_genes x n_coef
    pub sigma: Array1<f64>,            // n_genes
    pub df_residual: Array1<f64>,      // n_genes
    pub cov_coefficients: Array2<f64>, // n_coef x n_coef
    pub gene_names: Vec<String>,
    pub coef_names: Vec<String>,
    pub amean: Array1<f64>, // n_genes (per-feature mean over samples, NaN-skipping)
    pub design: Option<Array2<f64>>,
    pub contrasts: Option<Array2<f64>>,

    // Filled by eBayes.
    pub df_prior: Option<Array1<f64>>,
    pub s2_prior: Option<Array1<f64>>, // len 1 (constant) or n_genes (trend)
    pub var_prior: Option<Array1<f64>>, // per coef
    pub proportion: Option<f64>,
    pub s2_post: Option<Array1<f64>>,
    pub t: Option<Array2<f64>>,
    pub df_total: Option<Array1<f64>>,
    pub p_value: Option<Array2<f64>>,
    pub lods: Option<Array2<f64>>,
    pub f_stat: Option<Array1<f64>>,
    pub f_p_value: Option<Array1<f64>>,
}

impl MArrayLM {
    pub fn n_genes(&self) -> usize {
        self.coefficients.nrows()
    }
    pub fn n_coef(&self) -> usize {
        self.coefficients.ncols()
    }

    /// `fitted.MArrayLM`: fitted values `coefficients %*% t(design)`
    /// (`n_genes x n_samples`).
    ///
    /// Errors if the fit holds contrasts (its coefficients are contrasts rather
    /// than the original coefficients) or carries no design matrix.
    pub fn fitted(&self) -> Result<Array2<f64>> {
        if self.contrasts.is_some() {
            bail!("fit contains contrasts rather than coefficients, so fitted values cannot be computed");
        }
        let design = match &self.design {
            Some(d) => d,
            None => bail!("fit has no design matrix, so fitted values cannot be computed"),
        };
        Ok(self.coefficients.dot(&design.t()))
    }

    /// `residuals.MArrayLM`: observed minus fitted values, `y - fitted`.
    pub fn residuals(&self, y: &Array2<f64>) -> Result<Array2<f64>> {
        let fitted = self.fitted()?;
        Ok(y - &fitted)
    }
}

/// NaN-skipping mean of each row (matches R `rowMeans(x, na.rm = TRUE)`).
fn row_nanmean(exprs: &Array2<f64>) -> Array1<f64> {
    let n = exprs.nrows();
    let mut out = Array1::<f64>::zeros(n);
    for i in 0..n {
        let mut sum = 0.0;
        let mut cnt = 0usize;
        for &v in exprs.row(i) {
            if v.is_finite() {
                sum += v;
                cnt += 1;
            }
        }
        out[i] = if cnt > 0 { sum / cnt as f64 } else { f64::NAN };
    }
    out
}

/// Return the indices of design columns that are linearly dependent on
/// previous columns, or `None` if the design has full column rank.
/// Port of `nonEstimable` (returns indices rather than reconstructed names).
pub fn non_estimable(design: &Array2<f64>) -> Option<Vec<usize>> {
    let p = design.ncols();
    let rank = matrix_rank(design);
    if rank < p {
        // Identify dependent columns greedily.
        let mut kept: Vec<usize> = Vec::new();
        let mut dependent: Vec<usize> = Vec::new();
        for j in 0..p {
            let mut trial: Vec<usize> = kept.clone();
            trial.push(j);
            let sub = design.select(Axis(1), &trial);
            if matrix_rank(&sub) == trial.len() {
                kept.push(j);
            } else {
                dependent.push(j);
            }
        }
        Some(dependent)
    } else {
        None
    }
}

/// Whether `x` has full column rank, judged (as in limma's `is.fullrank`) by the
/// ratio of the smallest to the largest eigenvalue of `x' x` exceeding `1e-13`.
pub fn is_fullrank(x: &Array2<f64>) -> bool {
    let (evals, _) = eigh(&x.t().dot(x));
    let n = evals.len();
    let largest = evals[n - 1];
    let smallest = evals[0];
    largest > 0.0 && (smallest / largest).abs() > 1e-13
}

/// Fit gene-wise linear models by ordinary least squares.
///
/// * `exprs` — `n_genes x n_samples` expression matrix (may contain NaN).
/// * `design` — `n_samples x n_coef` design matrix (no NaN).
///
/// Port of `lmFit` (method="ls", ndups=1, block=None) + `lm_series`.
pub fn lmfit(
    exprs: &Array2<f64>,
    design: &Array2<f64>,
    gene_names: Vec<String>,
    coef_names: Vec<String>,
) -> Result<MArrayLM> {
    let n_genes = exprs.nrows();
    let n_samples = exprs.ncols();
    let p = design.ncols();

    if n_genes == 0 {
        bail!("expression matrix has zero rows");
    }
    if design.nrows() != n_samples {
        bail!(
            "row dimension of design ({}) does not match column dimension of data ({})",
            design.nrows(),
            n_samples
        );
    }
    if design.iter().any(|v| v.is_nan()) {
        bail!("NAs not allowed in design matrix");
    }
    if matrix_rank(design) < p {
        bail!(
            "design matrix is not of full column rank ({} of {} columns estimable); \
             reduce the design to estimable coefficients",
            matrix_rank(design),
            p
        );
    }

    let amean = row_nanmean(exprs);
    let any_missing = exprs.iter().any(|v| !v.is_finite());

    let mut fit = if !any_missing {
        lm_series_fast(exprs, design, &gene_names, &coef_names)
    } else {
        lm_series_genewise(exprs, design, &gene_names, &coef_names)
    };

    fit.amean = amean;
    fit.design = Some(design.clone());
    Ok(fit)
}

/// Fit gene-wise linear models by *weighted* least squares.
///
/// * `exprs` — `n_genes x n_samples` expression matrix (may contain NaN).
/// * `design` — `n_samples x n_coef` design matrix (no NaN).
/// * `weights` — `n_genes x n_samples` observation weights (the `weights`
///   argument of `lmFit`, e.g. the matrix returned by `voom`).
///
/// Port of `lmFit(method="ls")` -> `lm.series` weighted gene-wise path. Each
/// gene is fit on its observations with finite expression and finite positive
/// weight; an observation with weight `<= 0` or non-finite is dropped, exactly
/// as R's `weights[weights<=0] <- NA; M[!is.finite(weights)] <- NA`. As in
/// limma, `cov.coefficients` is the *unweighted* `(XᵀX)⁻¹` of the full design.
pub fn lmfit_weighted(
    exprs: &Array2<f64>,
    design: &Array2<f64>,
    weights: &Array2<f64>,
    gene_names: Vec<String>,
    coef_names: Vec<String>,
) -> Result<MArrayLM> {
    let n_genes = exprs.nrows();
    let n_samples = exprs.ncols();
    let p = design.ncols();

    if n_genes == 0 {
        bail!("expression matrix has zero rows");
    }
    if design.nrows() != n_samples {
        bail!(
            "row dimension of design ({}) does not match column dimension of data ({})",
            design.nrows(),
            n_samples
        );
    }
    if weights.nrows() != n_genes || weights.ncols() != n_samples {
        bail!(
            "weights dimensions ({}x{}) must match expression matrix ({}x{})",
            weights.nrows(),
            weights.ncols(),
            n_genes,
            n_samples
        );
    }
    if design.iter().any(|v| v.is_nan()) {
        bail!("NAs not allowed in design matrix");
    }
    if matrix_rank(design) < p {
        bail!(
            "design matrix is not of full column rank ({} of {} columns estimable); \
             reduce the design to estimable coefficients",
            matrix_rank(design),
            p
        );
    }

    let amean = row_nanmean(exprs);
    let mut fit = lm_series_weighted(exprs, design, weights, &gene_names, &coef_names);
    fit.amean = amean;
    fit.design = Some(design.clone());
    Ok(fit)
}

/// Fast path: identical QR for every gene (no missing values, no weights).
fn lm_series_fast(
    exprs: &Array2<f64>,
    design: &Array2<f64>,
    gene_names: &[String],
    coef_names: &[String],
) -> MArrayLM {
    let n = design.nrows();
    let p = design.ncols();
    let n_genes = exprs.nrows();

    let (q, r) = qr_econ(design);
    let rinv = inv_upper(&r);
    let xtx_inv = xtx_inv_from_r(&r);

    // effects head = Q^T Y  (p x n_genes), Y = exprs^T (n_samples x n_genes)
    let yt = exprs.t().to_owned(); // n_samples x n_genes
    let qty = q.t().dot(&yt); // p x n_genes
                              // beta = R^{-1} Q^T y  (p x n_genes)
    let beta = rinv.dot(&qty); // p x n_genes
    let coefficients = beta.t().to_owned(); // n_genes x p

    let df_res = (n - p) as f64;
    let mut sigma = Array1::<f64>::zeros(n_genes);
    // Fitted sum of squares per gene = sum_j (Q^T y)[j, g]^2. `qty` is
    // p x n_genes (row-major), so accumulate row-wise — each `qty.row(j)` is
    // contiguous, whereas the per-gene `qty.column(g)` is strided. For a fixed
    // gene the j = 0..p terms still accumulate in the same order, so each
    // gene's value is bit-identical; only the cache-unfriendly stride is gone,
    // which is what stung the wide, many-gene unweighted regime.
    if df_res > 0.0 {
        let mut ss_fit = vec![0.0f64; n_genes];
        for j in 0..p {
            for (g, &v) in qty.row(j).iter().enumerate() {
                ss_fit[g] += v * v;
            }
        }
        for g in 0..n_genes {
            let ssy: f64 = exprs.row(g).iter().map(|&v| v * v).sum();
            let rss = (ssy - ss_fit[g]).max(0.0);
            sigma[g] = (rss / df_res).sqrt();
        }
    } else {
        sigma.fill(f64::NAN);
    }

    let diag_se: Array1<f64> = (0..p).map(|j| xtx_inv[[j, j]].sqrt()).collect();
    let mut stdev_unscaled = Array2::<f64>::zeros((n_genes, p));
    for g in 0..n_genes {
        for j in 0..p {
            stdev_unscaled[[g, j]] = diag_se[j];
        }
    }

    let df_residual = Array1::from_elem(n_genes, df_res);

    new_fit(
        coefficients,
        stdev_unscaled,
        sigma,
        df_residual,
        xtx_inv,
        gene_names,
        coef_names,
    )
}

/// Per-gene fit result, returned by the gene-wise solvers so the outer loop can
/// be mapped (optionally in parallel) and the rows scattered back into the
/// output matrices in gene order.
struct RowFit {
    coef: Vec<f64>,  // length n_coef (all NaN if the gene was skipped)
    stdev: Vec<f64>, // length n_coef (all NaN if the gene was skipped)
    sigma: f64,      // NaN if df_residual == 0 or the gene was skipped
    df: f64,         // 0.0 if the gene was skipped
}

impl RowFit {
    /// A gene with too few usable observations to estimate the coefficients:
    /// NaN estimates and zero residual df, matching the serial `continue` path.
    fn skipped(p: usize) -> Self {
        RowFit {
            coef: vec![f64::NAN; p],
            stdev: vec![f64::NAN; p],
            sigma: f64::NAN,
            df: 0.0,
        }
    }
}

/// Per-worker scratch for the gene-wise OLS/WLS paths: a reusable index list plus
/// full-size design/response buffers. Each gene fills the first `n_obs` rows
/// (`n_obs <= n_arrays`) and hands that region to `solve_scaled` as a view, so
/// the inner loop reuses three allocations across genes instead of making them
/// fresh every gene. The values placed in the buffers — and therefore the
/// result — are identical to the previous allocate-per-gene path.
struct GeneScratch {
    obs: Vec<usize>,
    xtil: Array2<f64>,
    ytil: Array1<f64>,
}

impl GeneScratch {
    fn new(n_arrays: usize, p: usize) -> Self {
        GeneScratch {
            obs: Vec::with_capacity(n_arrays),
            xtil: Array2::zeros((n_arrays, p)),
            ytil: Array1::zeros(n_arrays),
        }
    }
}

/// Map `f` over gene indices `0..n`, threading a per-worker mutable scratch `S`
/// (built by `init`) into each call. Under `parallel` this is rayon's
/// `map_init`, so each worker thread reuses one scratch across the genes it
/// handles; serially a single scratch is reused. Results are always returned in
/// gene order, so the output is bit-identical regardless of feature or thread
/// count — only the *across-gene* iteration is parallel.
fn map_genes_init<S, I, F>(n: usize, init: I, f: F) -> Vec<RowFit>
where
    S: Send,
    I: Fn() -> S + Sync + Send,
    F: Fn(&mut S, usize) -> RowFit + Sync + Send,
{
    #[cfg(feature = "parallel")]
    {
        use rayon::prelude::*;
        (0..n)
            .into_par_iter()
            .map_init(init, |s, g| f(s, g))
            .collect()
    }
    #[cfg(not(feature = "parallel"))]
    {
        let mut s = init();
        (0..n).map(|g| f(&mut s, g)).collect()
    }
}

/// Write per-gene fits into the output matrices (sequential, gene order).
fn scatter_rows(
    rows: Vec<RowFit>,
    coefficients: &mut Array2<f64>,
    stdev_unscaled: &mut Array2<f64>,
    sigma: &mut Array1<f64>,
    df_residual: &mut Array1<f64>,
) {
    let p = coefficients.ncols();
    for (g, r) in rows.into_iter().enumerate() {
        for j in 0..p {
            coefficients[[g, j]] = r.coef[j];
            stdev_unscaled[[g, j]] = r.stdev[j];
        }
        sigma[g] = r.sigma;
        df_residual[g] = r.df;
    }
}

/// Solve the (already `sqrt(w)`-scaled) least-squares system `X̃ β ≈ ỹ` for one
/// gene, returning `(coefficients, unscaled SEs, residual sum of squares)`, where
/// the unscaled SE of coefficient `j` is `sqrt(diag((X̃ᵀX̃)⁻¹))[j]`. This is the
/// hot inner kernel of the gene-wise (voom / missing-value) fits, so the `n×p`
/// QR — the part whose cost grows with sample count — is what we want fast.
///
/// With the `faer` feature the QR runs through faer's vectorised pure-Rust
/// kernel; without it the in-crate ndarray Householder QR is used. The two are
/// numerically equivalent (both match R to 8+ significant figures) but not
/// bit-identical, since the QR algorithms round differently (~1e-13). The tiny
/// `p×p` triangular work is shared via the in-crate [`inv_upper`].
#[cfg(feature = "faer")]
fn solve_scaled(xtil: ArrayView2<f64>, ytil: ArrayView1<f64>) -> (Vec<f64>, Vec<f64>, f64) {
    use faer::{prelude::*, Mat};

    let n_obs = xtil.nrows();
    let p = xtil.ncols();

    let xf = Mat::from_fn(n_obs, p, |i, j| xtil[[i, j]]);
    let yf = Mat::from_fn(n_obs, 1, |i, _| ytil[i]);
    let qr = xf.qr();
    let beta = qr.solve_lstsq(&yf); // p x 1
    let coef: Vec<f64> = (0..p).map(|j| beta[(j, 0)]).collect();

    // R factor (p×p, upper-triangular): (X̃ᵀX̃)⁻¹ = R⁻¹R⁻ᵀ, so the unscaled SEs
    // are the row norms of R⁻¹. Only the upper triangle is read.
    let rf = qr.thin_R();
    let mut r = Array2::<f64>::zeros((p, p));
    for i in 0..p {
        for j in i..p {
            r[[i, j]] = rf[(i, j)];
        }
    }
    let rinv = inv_upper(&r);
    let stdev: Vec<f64> = (0..p)
        .map(|j| {
            (0..p)
                .map(|k| rinv[[j, k]] * rinv[[j, k]])
                .sum::<f64>()
                .sqrt()
        })
        .collect();

    // RSS from the fitted residuals of the scaled system.
    let mut rss = 0.0_f64;
    for i in 0..n_obs {
        let mut fit = 0.0;
        for (j, &b) in coef.iter().enumerate() {
            fit += xtil[[i, j]] * b;
        }
        let e = ytil[i] - fit;
        rss += e * e;
    }
    (coef, stdev, rss.max(0.0))
}

/// Pure-ndarray fallback used when the `faer` feature is off. Bit-identical to
/// the gene-wise arithmetic shipped in earlier releases.
#[cfg(not(feature = "faer"))]
fn solve_scaled(xtil: ArrayView2<f64>, ytil: ArrayView1<f64>) -> (Vec<f64>, Vec<f64>, f64) {
    let p = xtil.ncols();
    // qr_econ wants an owned matrix; the per-gene `xtil` slice is small and this
    // fallback path already allocates Q/R inside qr_econ.
    let xq = xtil.to_owned();
    let (q, r) = qr_econ(&xq);
    let rinv = inv_upper(&r);
    let qty = q.t().dot(&ytil);
    let beta = rinv.dot(&qty);
    let xtx_inv = xtx_inv_from_r(&r);
    let coef: Vec<f64> = (0..p).map(|j| beta[j]).collect();
    let stdev: Vec<f64> = (0..p).map(|j| xtx_inv[[j, j]].sqrt()).collect();
    let ssy: f64 = ytil.iter().map(|&v| v * v).sum();
    let ss_fit: f64 = qty.iter().map(|&v| v * v).sum();
    let rss = (ssy - ss_fit).max(0.0);
    (coef, stdev, rss)
}

/// Ordinary least squares for one gene on its finite observations (missing-value
/// path).
fn ols_one_gene(
    scratch: &mut GeneScratch,
    row: ArrayView1<f64>,
    design: &Array2<f64>,
    p: usize,
) -> RowFit {
    let GeneScratch { obs, xtil, ytil } = scratch;
    obs.clear();
    obs.extend((0..row.len()).filter(|&i| row[i].is_finite()));
    if obs.is_empty() {
        return RowFit::skipped(p);
    }
    let n_obs = obs.len();
    if n_obs < p {
        return RowFit::skipped(p);
    }
    // Gather this gene's finite rows of (design, y) into the scratch, in `obs`
    // order — exactly the rows `design.select(Axis(0), &obs)` would have built.
    for (r, &i) in obs.iter().enumerate() {
        ytil[r] = row[i];
        for j in 0..p {
            xtil[[r, j]] = design[[i, j]];
        }
    }
    let (coef, stdev, rss) = solve_scaled(xtil.slice(s![..n_obs, ..]), ytil.slice(s![..n_obs]));
    let df = (n_obs - p) as f64;
    let sigma = if df > 0.0 {
        (rss / df).sqrt()
    } else {
        f64::NAN
    };
    RowFit {
        coef,
        stdev,
        sigma,
        df,
    }
}

/// Weighted least squares for one gene on its observations with finite
/// expression and finite positive weight (the `sqrt(w)`-scaled OLS trick).
fn wls_one_gene(
    scratch: &mut GeneScratch,
    yr: ArrayView1<f64>,
    wr: ArrayView1<f64>,
    design: &Array2<f64>,
    p: usize,
) -> RowFit {
    let GeneScratch { obs, xtil, ytil } = scratch;
    obs.clear();
    obs.extend((0..yr.len()).filter(|&k| yr[k].is_finite() && wr[k].is_finite() && wr[k] > 0.0));
    let n_obs = obs.len();
    if n_obs < p {
        return RowFit::skipped(p);
    }
    // Scale each observed row of (design, y) by sqrt(weight) into the scratch:
    // WLS becomes OLS on (X̃, ỹ), so the same `solve_scaled` kernel handles it.
    for (r, &k) in obs.iter().enumerate() {
        let sw = wr[k].sqrt();
        ytil[r] = sw * yr[k];
        for j in 0..p {
            xtil[[r, j]] = sw * design[[k, j]];
        }
    }
    let (coef, stdev, rss) = solve_scaled(xtil.slice(s![..n_obs, ..]), ytil.slice(s![..n_obs]));
    let df = (n_obs - p) as f64;
    let sigma = if df > 0.0 {
        (rss / df).sqrt()
    } else {
        f64::NAN
    };
    RowFit {
        coef,
        stdev,
        sigma,
        df,
    }
}

/// Slow path: genewise QR on the observed (finite) entries of each row.
fn lm_series_genewise(
    exprs: &Array2<f64>,
    design: &Array2<f64>,
    gene_names: &[String],
    coef_names: &[String],
) -> MArrayLM {
    let p = design.ncols();
    let n_genes = exprs.nrows();

    let mut coefficients = Array2::<f64>::from_elem((n_genes, p), f64::NAN);
    let mut stdev_unscaled = Array2::<f64>::from_elem((n_genes, p), f64::NAN);
    let mut sigma = Array1::<f64>::from_elem(n_genes, f64::NAN);
    let mut df_residual = Array1::<f64>::zeros(n_genes);

    let n_arrays = design.nrows();
    let rows = map_genes_init(
        n_genes,
        || GeneScratch::new(n_arrays, p),
        |s, g| ols_one_gene(s, exprs.row(g), design, p),
    );
    scatter_rows(
        rows,
        &mut coefficients,
        &mut stdev_unscaled,
        &mut sigma,
        &mut df_residual,
    );

    // cov_coefficients uses the full design (matches limma).
    let (_q, r) = qr_econ(design);
    let xtx_inv = xtx_inv_from_r(&r);

    new_fit(
        coefficients,
        stdev_unscaled,
        sigma,
        df_residual,
        xtx_inv,
        gene_names,
        coef_names,
    )
}

/// Weighted gene-wise path: per-gene QR on the `sqrt(w)`-scaled observations.
fn lm_series_weighted(
    exprs: &Array2<f64>,
    design: &Array2<f64>,
    weights: &Array2<f64>,
    gene_names: &[String],
    coef_names: &[String],
) -> MArrayLM {
    let p = design.ncols();
    let n_genes = exprs.nrows();

    let mut coefficients = Array2::<f64>::from_elem((n_genes, p), f64::NAN);
    let mut stdev_unscaled = Array2::<f64>::from_elem((n_genes, p), f64::NAN);
    let mut sigma = Array1::<f64>::from_elem(n_genes, f64::NAN);
    let mut df_residual = Array1::<f64>::zeros(n_genes);

    let n_arrays = design.nrows();
    let rows = map_genes_init(
        n_genes,
        || GeneScratch::new(n_arrays, p),
        |s, g| wls_one_gene(s, exprs.row(g), weights.row(g), design, p),
    );
    scatter_rows(
        rows,
        &mut coefficients,
        &mut stdev_unscaled,
        &mut sigma,
        &mut df_residual,
    );

    // cov_coefficients uses the unweighted full design, matching lm.series.
    let (_q, r) = qr_econ(design);
    let xtx_inv = xtx_inv_from_r(&r);

    new_fit(
        coefficients,
        stdev_unscaled,
        sigma,
        df_residual,
        xtx_inv,
        gene_names,
        coef_names,
    )
}

#[allow(clippy::too_many_arguments)]
pub(crate) fn new_fit(
    coefficients: Array2<f64>,
    stdev_unscaled: Array2<f64>,
    sigma: Array1<f64>,
    df_residual: Array1<f64>,
    cov_coefficients: Array2<f64>,
    gene_names: &[String],
    coef_names: &[String],
) -> MArrayLM {
    let n_genes = coefficients.nrows();
    MArrayLM {
        coefficients,
        stdev_unscaled,
        sigma,
        df_residual,
        cov_coefficients,
        gene_names: gene_names.to_vec(),
        coef_names: coef_names.to_vec(),
        amean: Array1::zeros(n_genes),
        design: None,
        contrasts: None,
        df_prior: None,
        s2_prior: None,
        var_prior: None,
        proportion: None,
        s2_post: None,
        t: None,
        df_total: None,
        p_value: None,
        lods: None,
        f_stat: None,
        f_p_value: None,
    }
}

#[cfg(test)]
#[allow(clippy::excessive_precision)]
mod tests {
    use super::*;
    use ndarray::array;

    #[test]
    fn is_fullrank_matches_r() {
        // Reference: is.fullrank() in limma 3.68.3.
        let full = array![[1.0, 0.0], [1.0, 0.0], [1.0, 1.0]];
        let deficient = array![[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]];
        let single = array![[1.0], [2.0]];
        assert!(is_fullrank(&full));
        assert!(!is_fullrank(&deficient));
        assert!(is_fullrank(&single));
    }

    fn rclose(a: f64, b: f64) -> bool {
        (a - b).abs() <= 1e-7 * (1.0 + b.abs())
    }

    /// Rebuild scratch/lmfit_weighted_ref.R's 8x6 expression + weight matrices
    /// from the same purely rational, 0-indexed formula (bit-identical inputs).
    fn weighted_fixture() -> (Array2<f64>, Array2<f64>, Array2<f64>) {
        let ngenes = 8usize;
        let narrays = 6usize;
        let group = [0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
        let mut y = Array2::<f64>::zeros((ngenes, narrays));
        let mut w = Array2::<f64>::zeros((ngenes, narrays));
        for g in 0..ngenes {
            let gi = g as i64;
            let base = ((gi % 5) - 2) as f64;
            let eff = (((gi * 3) % 7) - 3) as f64;
            let mu = 6.0 + (gi % 4) as f64 * 0.5;
            for k in 0..narrays {
                let ki = k as i64;
                let noise = (((gi * 13 + ki * 17) % 19) - 9) as f64 * 0.05;
                y[[g, k]] = mu + eff * 0.3 * group[k] + base * 0.1 + noise;
                w[[g, k]] = 0.5 + (((gi * 7 + ki * 5) % 11) as f64) * 0.1;
            }
        }
        let mut design = Array2::<f64>::zeros((narrays, 2));
        for k in 0..narrays {
            design[[k, 0]] = 1.0;
            design[[k, 1]] = group[k];
        }
        (y, design, w)
    }

    fn names(n: usize, p: usize) -> (Vec<String>, Vec<String>) {
        ((0..n).map(|i| format!("g{i}")).collect(), {
            let mut c = vec!["Intercept".to_string()];
            for j in 1..p {
                c.push(format!("x{j}"));
            }
            c
        })
    }

    #[test]
    fn lmfit_weighted_matches_r() {
        let (y, design, w) = weighted_fixture();
        let (gn, cn) = names(8, 2);
        let fit = lmfit_weighted(&y, &design, &w, gn, cn).unwrap();

        // optim/lmFit(Y, design, weights=W) reference from limma 3.68.3.
        let coef = [
            [6.0083333333333346, -1.0051075268817207],
            [6.5034482758620706, -0.33678160919540306],
            [6.8035714285714244, 1.1567733990147793],
            [7.677631578947369, -0.22406015037593929],
            [6.3538461538461544, 0.29878542510121447],
            [6.1539999999999981, -0.534769230769229],
            [6.8057142857142869, 0.59828571428571331],
            [7.7029411764705884, -1.200084033613446],
        ];
        let stdev = [
            [0.57735026918962584, 0.80988516376991593],
            [0.58722021951470349, 0.8235052638205963],
            [0.59761430466719667, 0.83783676414308383],
            [0.51298917604257699, 0.78759174188135006],
            [0.62017367294604198, 0.80484363658553359],
            [0.63245553203367566, 0.88578517972214033],
            [0.5345224838248489, 0.82807867121082501],
            [0.54232614454664041, 0.7614669610515673],
        ];
        let sigma = [
            0.26599314305172844,
            0.099539168054648686,
            0.33639780799178071,
            0.38269209879641242,
            0.11173095214852602,
            0.31304890254497836,
            0.36171318550949705,
            0.10717044462761564,
        ];
        for g in 0..8 {
            for j in 0..2 {
                assert!(
                    rclose(fit.coefficients[[g, j]], coef[g][j]),
                    "coef[{g}][{j}] {}",
                    fit.coefficients[[g, j]]
                );
                assert!(
                    rclose(fit.stdev_unscaled[[g, j]], stdev[g][j]),
                    "stdev[{g}][{j}] {}",
                    fit.stdev_unscaled[[g, j]]
                );
            }
            assert!(
                rclose(fit.sigma[g], sigma[g]),
                "sigma[{g}] {}",
                fit.sigma[g]
            );
            assert_eq!(fit.df_residual[g], 4.0);
        }
        // cov.coefficients is the unweighted (XᵀX)⁻¹.
        assert!(rclose(fit.cov_coefficients[[0, 0]], 0.33333333333333348));
        assert!(rclose(fit.cov_coefficients[[0, 1]], -0.33333333333333354));
        assert!(rclose(fit.cov_coefficients[[1, 1]], 0.66666666666666685));
    }

    #[test]
    fn lmfit_weighted_drops_zero_weight_and_missing() {
        let (mut y, design, mut w) = weighted_fixture();
        w[[3, 4]] = 0.0; // zero weight -> observation dropped
        y[[5, 2]] = f64::NAN; // missing expression -> observation dropped
        let (gn, cn) = names(8, 2);
        let fit = lmfit_weighted(&y, &design, &w, gn, cn).unwrap();

        // Reference (lmFit with W[4,5]=0 and Y[6,3]=NA, 1-indexed): only genes
        // 4 and 6 (0-indexed 3 and 5) lose one observation -> df 4 becomes 3.
        let df = [4.0, 4.0, 4.0, 3.0, 4.0, 3.0, 4.0, 4.0];
        for (g, &expect) in df.iter().enumerate() {
            assert_eq!(fit.df_residual[g], expect, "df[{g}]");
        }
        assert!(rclose(fit.coefficients[[3, 0]], 7.6776315789473708));
        assert!(rclose(fit.coefficients[[3, 1]], -0.22096491228070209));
        assert!(rclose(fit.coefficients[[5, 0]], 6.1868421052631613));
        assert!(rclose(fit.coefficients[[5, 1]], -0.56761133603239));
        assert!(
            rclose(fit.sigma[3], 0.44188309824502131),
            "sigma3 {}",
            fit.sigma[3]
        );
        assert!(
            rclose(fit.sigma[5], 0.35751900376998208),
            "sigma5 {}",
            fit.sigma[5]
        );
        assert!(rclose(fit.stdev_unscaled[[3, 1]], 0.96427411113412587));
        assert!(rclose(fit.stdev_unscaled[[5, 0]], 0.72547625011001193));
        // Unaffected genes still match the clean fit.
        assert!(rclose(fit.coefficients[[0, 1]], -1.0051075268817207));
        assert!(rclose(fit.sigma[7], 0.10717044462761564));
    }

    #[test]
    fn fitted_and_residuals_match_r() {
        // Reference: scratch/fitted_resid_ref.R (lmFit -> fitted/residuals).
        let group = [0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
        let mut y = Array2::<f64>::zeros((5, 6));
        for g0 in 0..5 {
            let lvl = 5.0 + (g0 % 4) as f64 * 0.3;
            let eff = (((g0 as i64 * 3) % 7) - 3) as f64 * 0.2;
            for k0 in 0..6 {
                let noise = (((g0 as i64 * 7 + k0 as i64 * 13) % 11) - 5) as f64 * 0.05;
                y[[g0, k0]] = lvl + eff * group[k0] + noise;
            }
        }
        let mut design = Array2::<f64>::zeros((6, 2));
        for k0 in 0..6 {
            design[[k0, 0]] = 1.0;
            design[[k0, 1]] = group[k0];
        }
        let (gn, cn) = (
            (0..5).map(|i| format!("g{i}")).collect::<Vec<_>>(),
            vec!["Intercept".into(), "group".into()],
        );
        let fit = lmfit(&y, &design, gn, cn).unwrap();

        let fitted = fit.fitted().unwrap();
        assert_eq!(fitted.dim(), (5, 6));
        assert!(rclose(fitted[[0, 0]], 4.8500000000000014));
        assert!(rclose(fitted[[0, 3]], 4.5500000000000007));
        assert!(rclose(fitted[[4, 0]], 5.1499999999999995));
        assert!(rclose(fitted[[4, 3]], 5.2999999999999998));
        // Constant within each group (intercept + group effect only).
        assert!(rclose(fitted[[0, 1]], fitted[[0, 0]]));
        assert!(rclose(fitted[[0, 5]], fitted[[0, 3]]));

        let resid = fit.residuals(&y).unwrap();
        assert_eq!(resid.dim(), (5, 6));
        assert!(rclose(resid[[0, 0]], -0.10000000000000142));
        assert!(rclose(resid[[0, 2]], 0.099999999999998757));
        assert!(rclose(resid[[4, 2]], 0.10000000000000053));
        // y == fitted + residuals.
        for g in 0..5 {
            for k in 0..6 {
                assert!(rclose(y[[g, k]], fitted[[g, k]] + resid[[g, k]]));
            }
        }
    }
}