limma-rust 0.1.0

//! Gene-set tests ported from limma: the deterministic rank-based and
//! pre-ranked competitive tests, plus the `fry`/`roast` rotation tests.
//!
//! Ported here:
//! * [`rank_sum_test_with_correlation`] — Wilcoxon–Mann–Whitney rank-sum test
//!   adjusted for inter-gene correlation (`rankSumTestWithCorrelation`).
//! * [`gene_set_test`] / [`wilcox_gst`] — competitive set test from gene ranks
//!   (`geneSetTest(..., ranks.only=TRUE)` / `wilcoxGST`). The permutation path
//!   (`ranks.only=FALSE`) is RNG-dependent and not ported.
//! * [`camera_pr`] — pre-ranked competitive test with inter-gene-correlation
//!   correction (`cameraPR`, `directional=TRUE`, fixed scalar correlation).
//! * [`camera`] — competitive test from an expression matrix and design,
//!   computing the moderated-t statistics internally (`camera`/`camera.default`,
//!   fixed `inter.gene.cor`).
//! * [`inter_gene_correlation`] — variance-inflation factor and mean inter-gene
//!   correlation of the residuals (`interGeneCorrelation`).
//! * [`fry`] — fast approximation to `roast` (the `nrot = Inf`, `prior.df = Inf`
//!   limit), giving deterministic directional and mixed p-values (`fry`).
//! * [`roast`] — self-contained rotation gene-set test (`roast`), Monte-Carlo
//!   over random rotations of the residual space. Reproduces limma's p-values
//!   bit-for-bit via the [`RRng`] port of R's Mersenne-Twister.
//! * [`ids2indices`] — map gene-set identifier lists to row indices.

use std::collections::HashSet;

use anyhow::{bail, Result};
use ndarray::{Array1, Array2};
use statrs::distribution::{Beta, ContinuousCDF, Normal, StudentsT};

use crate::ebayes::{fit_fdist, squeeze_var, squeeze_var_post, tmixture_vector};
use crate::linalg::{eigh, matrix_rank, qr_econ, qr_full_q};
use crate::proptruenull::{prop_true_null, PropTrueNullMethod};
use crate::rng::RRng;
use crate::special::gauss_legendre_01;
use crate::zscore::{zscore_t, ZscoreTMethod};

/// Ascending 1-based ranks of `x`, with tied values sharing the mean of the
/// ranks they span (R's `rank(x, ties.method="average")`).
///
/// # Panics
/// Panics when the sort has to compare a NaN, because the comparator unwraps
/// `partial_cmp`.
fn rank_average(x: &[f64]) -> Vec<f64> {
    let mut order: Vec<usize> = (0..x.len()).collect();
    order.sort_by(|&lhs, &rhs| x[lhs].partial_cmp(&x[rhs]).unwrap());

    let mut out = vec![0.0; x.len()];
    let mut start = 0;
    while start < order.len() {
        let head = x[order[start]];
        // The run always contains its first element; extend it over every
        // following sorted position that holds the same value.
        let run = 1 + order[start + 1..]
            .iter()
            .take_while(|&&k| x[k] == head)
            .count();
        let end = start + run; // exclusive
        // The run covers ranks (start+1)..=end; every member gets their mean.
        let shared = (start + 1 + end) as f64 / 2.0;
        for &k in &order[start..end] {
            out[k] = shared;
        }
        start = end;
    }
    out
}

/// Average ranks of `x` plus the size of every group of exactly equal values,
/// both derived from one shared sort.
///
/// The first element is what [`rank_average`] returns; the second lists the
/// tie-group sizes in ascending value order (a size of 1 means the value is
/// unique). Both depend only on the statistic vector, not on set membership,
/// so the correlation-adjusted rank-sum path and [`camera_pr`] compute them
/// once and reuse them for every set.
///
/// # Panics
/// Panics when the sort has to compare a NaN, because the comparator unwraps
/// `partial_cmp`.
fn rank_average_and_ties(x: &[f64]) -> (Vec<f64>, Vec<usize>) {
    let mut order: Vec<usize> = (0..x.len()).collect();
    order.sort_by(|&lhs, &rhs| x[lhs].partial_cmp(&x[rhs]).unwrap());

    let mut out = vec![0.0; x.len()];
    let mut group_sizes = Vec::new();
    let mut start = 0;
    while start < order.len() {
        let head = x[order[start]];
        // The run always contains its first element; extend it over every
        // following sorted position that holds the same value.
        let run = 1 + order[start + 1..]
            .iter()
            .take_while(|&&k| x[k] == head)
            .count();
        let end = start + run; // exclusive
        // The run covers ranks (start+1)..=end; every member gets their mean.
        let shared = (start + 1 + end) as f64 / 2.0;
        for &k in &order[start..end] {
            out[k] = shared;
        }
        group_sizes.push(run);
        start = end;
    }
    (out, group_sizes)
}

/// `P(T <= x)` for a Student's t variable with `df` degrees of freedom.
///
/// An infinite `df` is evaluated with the standard normal CDF instead of the
/// t distribution.
///
/// # Panics
/// Panics if `StudentsT::new` rejects `df` (the constructor result is unwrapped).
fn pt_lower(x: f64, df: f64) -> f64 {
    if !df.is_infinite() {
        return StudentsT::new(0.0, 1.0, df).unwrap().cdf(x);
    }
    Normal::new(0.0, 1.0).unwrap().cdf(x)
}

/// `P(T > x)` for a Student's t variable with `df` degrees of freedom.
///
/// Evaluated as the lower tail at the reflected point, using the symmetry of
/// the distribution about zero (`P(T > x) = P(T < -x)`).
fn pt_upper(x: f64, df: f64) -> f64 {
    let reflected = -x;
    pt_lower(reflected, df)
}

/// Two-sample Wilcoxon–Mann–Whitney rank-sum test that allows the members of
/// the test set to be correlated (limma's `rankSumTestWithCorrelation`).
///
/// * `index` — 0-based positions of the test set within `statistics`.
/// * `statistics` — the statistic for every gene in the universe.
/// * `correlation` — assumed correlation between members of the test set.
/// * `df` — degrees of freedom for the reference t distribution
///   (`f64::INFINITY` selects the normal).
///
/// Returns the one-sided p-values as `(less, greater)`, in the same order as
/// limma's `c(less, greater)`.
pub fn rank_sum_test_with_correlation(
    index: &[usize],
    statistics: &[f64],
    correlation: f64,
    df: f64,
) -> (f64, f64) {
    let universe = statistics.len();
    let (ranks, tie_sizes) = rank_average_and_ties(statistics);
    rank_sum_core(index, universe, &ranks, &tie_sizes, correlation, df)
}

/// Rank-sum computation behind [`rank_sum_test_with_correlation`], taking the
/// average ranks `r` and tie-group sizes `tie_sizes` of the whole statistic
/// vector (length `n`) as inputs.
///
/// Split out so that callers testing many sets against the *same* statistics
/// (e.g. [`camera_pr`]) can rank the universe once and reuse the result for
/// every set. Returns `(less, greater)` one-sided p-values.
///
/// # Panics
/// Panics if an entry of `index` is out of range for `r`.
fn rank_sum_core(
    index: &[usize],
    n: usize,
    r: &[f64],
    tie_sizes: &[usize],
    correlation: f64,
    df: f64,
) -> (f64, f64) {
    let in_set = index.len();
    let out_set = n - in_set;
    let rank_total: f64 = index.iter().map(|&i| r[i]).sum();

    let (a, b, total) = (in_set as f64, out_set as f64, n as f64);

    // Mann–Whitney U for the set and its null expectation.
    let u = a * b + a * (a + 1.0) / 2.0 - rank_total;
    let mu = a * b / 2.0;

    // With no correlation, or a single-gene set, the usual Wilcoxon variance
    // applies; otherwise use the correlation-adjusted form.
    let uncorrelated = correlation == 0.0 || in_set == 1;
    let base_var = if uncorrelated {
        a * b * (total + 1.0) / 12.0
    } else {
        // The leading coefficient is asin(1) = pi/2.
        let s = std::f64::consts::FRAC_PI_2 * a * b
            + 0.5_f64.asin() * a * b * (b - 1.0)
            + (correlation / 2.0).asin() * a * (a - 1.0) * b * (b - 1.0)
            + ((correlation + 1.0) / 2.0).asin() * a * (a - 1.0) * b;
        s / 2.0 / std::f64::consts::PI
    };

    // Shrink the variance when any value is tied with another.
    let has_ties = tie_sizes.iter().any(|&c| c > 1);
    let sigma2 = if has_ties {
        let adjustment: f64 = tie_sizes
            .iter()
            .map(|&c| {
                let cf = c as f64;
                cf * (cf + 1.0) * (cf - 1.0)
            })
            .sum::<f64>()
            / (total * (total + 1.0) * (total - 1.0));
        base_var * (1.0 - adjustment)
    } else {
        base_var
    };

    // Continuity-corrected z-scores for each tail.
    let sd = sigma2.sqrt();
    let z_lower = (u + 0.5 - mu) / sd;
    let z_upper = (u - 0.5 - mu) / sd;

    // The tails swap on output: `less` comes from the upper tail of U and
    // `greater` from the lower tail, because the ranks run opposite to U.
    (pt_upper(z_upper, df), pt_lower(z_lower, df))
}

/// Alternative hypothesis for [`gene_set_test`].
///
/// The variant decides how the statistics are transformed before ranking and
/// which one-sided rank-sum p-value is reported.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Alternative {
    /// Genes in the set are up-regulated: statistics are used as given and the
    /// `greater` p-value is reported.
    Up,
    /// Genes in the set are down-regulated: statistics are negated and the test
    /// then proceeds as for [`Alternative::Up`].
    Down,
    /// Two-sided: up or down (limma `"either"`/`"two.sided"`); reports twice the
    /// smaller of the two one-sided p-values.
    Either,
    /// Genes change in either direction (limma `"mixed"`, the default):
    /// statistics are replaced by their absolute values and the `greater`
    /// p-value is reported.
    Mixed,
}

/// Competitive gene-set test computed from gene ranks
/// (`geneSetTest(..., ranks.only=TRUE)`).
///
/// * `index` — 0-based positions of the set within `statistics`.
/// * `statistics` — the statistic for every gene.
/// * `alternative` — which hypothesis to test; see [`Alternative`].
///
/// The statistics are first transformed to suit the alternative (absolute value
/// for `Mixed`, negation for `Down`), then passed to
/// [`rank_sum_test_with_correlation`] with zero correlation and infinite df.
/// Returns the p-value for the requested alternative.
pub fn gene_set_test(index: &[usize], statistics: &[f64], alternative: Alternative) -> f64 {
    let transformed: Vec<f64> = match alternative {
        Alternative::Mixed => statistics.iter().map(|&s| s.abs()).collect(),
        Alternative::Down => statistics.iter().map(|&s| -s).collect(),
        Alternative::Up | Alternative::Either => statistics.to_vec(),
    };

    let (less, greater) =
        rank_sum_test_with_correlation(index, &transformed, 0.0, f64::INFINITY);

    match alternative {
        Alternative::Either => 2.0 * less.min(greater),
        // `Down` negated its statistics above, so it reads the same tail as
        // `Up`; `Mixed` ranks absolute values and also reads the upper tail.
        Alternative::Up | Alternative::Down | Alternative::Mixed => greater,
    }
}

/// Mean-rank gene-set test (`wilcoxGST`): [`gene_set_test`] with the default
/// `"mixed"` alternative.
///
/// `index` are 0-based positions of the set within `statistics`; the return
/// value is the p-value [`gene_set_test`] gives for [`Alternative::Mixed`].
pub fn wilcox_gst(index: &[usize], statistics: &[f64]) -> f64 {
    gene_set_test(index, statistics, Alternative::Mixed)
}

/// Direction of enrichment reported per gene set, carried by both
/// [`CameraResult`] and [`FryResult`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Direction {
    /// Set statistics shifted up relative to the rest.
    Up,
    /// Set statistics shifted down relative to the rest.
    Down,
}

/// One gene set's result from [`camera_pr`] (and therefore from [`camera`],
/// which delegates to it).
#[derive(Clone, Debug)]
pub struct CameraResult {
    /// 0-based position of this set in the input `index` list. This still
    /// identifies the set after the rows have been sorted by p-value.
    pub set: usize,
    /// Number of genes in the set.
    pub n_genes: usize,
    /// Net direction of enrichment: `Down` when the down-tail p-value is
    /// strictly smaller than the up-tail one, otherwise `Up`.
    pub direction: Direction,
    /// Two-sided competitive p-value: twice the smaller one-sided p-value.
    pub p_value: f64,
    /// Benjamini–Hochberg adjusted p-value across all sets; equal to `p_value`
    /// when only one set was tested.
    pub fdr: f64,
}

/// Arithmetic mean of `x`; an empty slice yields NaN (zero divided by zero).
fn mean(x: &[f64]) -> f64 {
    let total: f64 = x.iter().sum();
    let count = x.len() as f64;
    total / count
}

/// Unbiased sample variance of `x` (sum of squared deviations from the mean,
/// divided by `n - 1`), as R's `var` computes it.
fn sample_var(x: &[f64]) -> f64 {
    let centre = mean(x);
    let squared_deviations: f64 = x
        .iter()
        .map(|&v| {
            let d = v - centre;
            d * d
        })
        .sum();
    squared_deviations / (x.len() as f64 - 1.0)
}

/// Pre-ranked competitive gene-set test (`cameraPR`), covering limma's default
/// `directional=TRUE` mode with one fixed scalar `inter.gene.cor`.
///
/// * `statistic` — one gene-level statistic per gene (e.g. a moderated t).
/// * `index` — the gene sets, each a list of 0-based positions into `statistic`.
/// * `inter_gene_cor` — inter-gene correlation applied to every set.
/// * `use_ranks` — `true` selects the correlation-adjusted rank-sum test
///   (infinite df); `false` selects the parametric two-sample t form with
///   `G - 2` df, where `G = statistic.len()`.
/// * `sort` — order the rows by ascending p-value when there is more than one
///   set; the sort is stable, so tied sets keep their input order (as in R).
///
/// Each row's `fdr` is the Benjamini–Hochberg adjustment across all sets, or
/// the raw p-value when exactly one set was supplied.
///
/// # Panics
/// Panics if a set member is out of range for `statistic`, or if the sort
/// meets a NaN p-value.
pub fn camera_pr(
    statistic: &[f64],
    index: &[Vec<usize>],
    inter_gene_cor: f64,
    use_ranks: bool,
    sort: bool,
) -> Vec<CameraResult> {
    let n_total = statistic.len();
    let gf = n_total as f64;
    let grand_mean = mean(statistic);
    let grand_var = sample_var(statistic);
    let df = if use_ranks { f64::INFINITY } else { gf - 2.0 };

    // Ranks and tie-group sizes are a function of `statistic` alone, so the
    // rank path builds them once here and every set reuses them.
    let shared_ranks = if use_ranks {
        Some(rank_average_and_ties(statistic))
    } else {
        None
    };

    let mut rows: Vec<CameraResult> = index
        .iter()
        .enumerate()
        .map(|(set, members)| {
            let n_genes = members.len();
            let (down, up) = match &shared_ranks {
                Some((r, tie_sizes)) => {
                    rank_sum_core(members, n_total, r, tie_sizes, inter_gene_cor, df)
                }
                None => {
                    // Two-sample t of set vs. rest, with the set's variance
                    // inflated by the inter-gene correlation.
                    let mf = n_genes as f64;
                    let m2 = gf - mf;
                    let vif = 1.0 + (mf - 1.0) * inter_gene_cor;
                    let set_mean = members.iter().map(|&i| statistic[i]).sum::<f64>() / mf;
                    let delta = gf / m2 * (set_mean - grand_mean);
                    let pooled =
                        ((gf - 1.0) * grand_var - delta * delta * mf * m2 / gf) / (gf - 2.0);
                    let t = delta / (pooled * (vif / mf + 1.0 / m2)).sqrt();
                    (pt_lower(t, df), pt_upper(t, df))
                }
            };
            let direction = if down < up {
                Direction::Down
            } else {
                Direction::Up
            };
            CameraResult {
                set,
                n_genes,
                direction,
                p_value: 2.0 * down.min(up),
                fdr: f64::NAN,
            }
        })
        .collect();

    // Benjamini–Hochberg across sets; a lone set keeps its raw p-value.
    match rows.len() {
        0 => {}
        1 => {
            let only = rows[0].p_value;
            rows[0].fdr = only;
        }
        _ => {
            let pvals: Vec<f64> = rows.iter().map(|row| row.p_value).collect();
            for (row, adjusted) in rows.iter_mut().zip(crate::toptable::p_adjust_bh(&pvals)) {
                row.fdr = adjusted;
            }
        }
    }

    if sort && rows.len() > 1 {
        rows.sort_by(|lhs, rhs| lhs.p_value.partial_cmp(&rhs.p_value).unwrap());
    }
    rows
}

/// Variance-inflation factor and mean inter-gene correlation of the residuals
/// (`interGeneCorrelation`).
///
/// `y` is `G x n` (genes by samples) and `design` is `n x p`. The residual
/// effects are rows `rank(design)..n` of `Q' y'`. Each gene's residual effects
/// are divided by their root mean square, the scaled effects are averaged over
/// genes within each residual coordinate, and the VIF is `G` times the mean
/// square of those averages.
///
/// Returns `(vif, correlation)` with `correlation = (vif - 1) / (G - 1)`.
pub fn inter_gene_correlation(y: &Array2<f64>, design: &Array2<f64>) -> (f64, f64) {
    let n_genes = y.nrows();
    let n_samples = y.ncols();
    let rank = matrix_rank(design);
    let n_resid = n_samples - rank;
    let effects = qr_full_q(design).t().dot(&y.t()); // n x G = Q' t(y)

    let genes_f = n_genes as f64;
    let resid_f = n_resid as f64;

    // Root mean square of each gene's residual effects.
    let scale: Vec<f64> = (0..n_genes)
        .map(|gi| {
            let sum_sq = (rank..n_samples).fold(0.0_f64, |acc, k| {
                let e = effects[[k, gi]];
                acc + e * e
            });
            (sum_sq / resid_f).sqrt()
        })
        .collect();

    // Sum over residual coordinates of the squared gene-averaged scaled effect.
    let sumsq = (rank..n_samples).fold(0.0_f64, |total, k| {
        let ubar =
            (0..n_genes).fold(0.0_f64, |acc, gi| acc + effects[[k, gi]] / scale[gi]) / genes_f;
        total + ubar * ubar
    });

    let vif = genes_f * sumsq / resid_f;
    let correlation = (vif - 1.0) / (genes_f - 1.0);
    (vif, correlation)
}

/// Converts a t-statistic `x` on `df` degrees of freedom to the equivalent
/// standard-normal deviate using Hill's 1970 approximation (`.zscoreTHill`,
/// i.e. `zscoreT(approx=TRUE, method="hill")`).
///
/// Accurate for `df >= 2`; requires `df > 0.5`. The result carries the sign of
/// `x`.
fn zscore_t_hill(x: f64, df: f64) -> f64 {
    let shifted_df = df - 0.5;
    let scale = 48.0 * shifted_df * shifted_df;
    let w = shifted_df * (x * x / df).ln_1p();

    // Rational correction to sqrt(w), evaluated in the same order as limma.
    let numerator = ((-0.4 * w - 3.3) * w - 24.0) * w - 85.5;
    let denominator = 0.8 * w * w + 100.0 + scale;
    let correction = (numerator / denominator + w + 3.0) / scale + 1.0;

    let magnitude = correction * w.sqrt();
    magnitude * x.signum()
}

/// Returns a copy of `design` with column `coef` moved to the final position
/// and every other column kept in its original relative order (limma's
/// `design[,c((1:p)[-contrast],contrast)]`).
///
/// When `coef` is already the last column the design is copied unchanged.
fn move_coef_last(design: &Array2<f64>, coef: usize) -> Array2<f64> {
    let n_cols = design.ncols();
    if coef == n_cols - 1 {
        return design.to_owned();
    }

    // Every column except `coef`, in order, followed by `coef` itself.
    let source_columns = (0..n_cols)
        .filter(|&c| c != coef)
        .chain(std::iter::once(coef));

    let mut reordered = Array2::<f64>::zeros((design.nrows(), n_cols));
    for (dst, src) in source_columns.enumerate() {
        reordered.column_mut(dst).assign(&design.column(src));
    }
    reordered
}

/// Result of [`contrast_as_coef`]: the reformed design together with the
/// location of the contrast coefficients inside it.
#[derive(Clone, Debug)]
pub struct ContrastAsCoef {
    /// Reformed design matrix (`n x p`) in which the requested contrasts appear
    /// as plain coefficients.
    pub design: Array2<f64>,
    /// 0-based columns of `design` that hold the contrast coefficients: the
    /// leading `rank` columns when `first` was set, otherwise the trailing ones.
    pub coef: Vec<usize>,
    /// Rank of the contrast matrix (the number of contrast coefficients).
    pub rank: usize,
}

/// Reform a design matrix so that one or more contrasts become simple
/// coefficients (`contrastAsCoef`).
///
/// `design` is `n x p`; `contrast` is `p x ncontrasts`. With `first = true` the
/// contrast coefficients occupy the leading columns of the reformed design,
/// otherwise the trailing columns (limma's `first` argument). The non-contrast
/// columns are the orthogonal completion of the contrast space.
///
/// Only full-column-rank contrasts are supported: limma's rank-deficient path
/// relies on LINPACK column pivoting in `qr`, which this port does not
/// replicate. The completion columns follow the same Householder convention as
/// R's `qr`, so the reformed design matches limma to rounding.
///
/// # Errors
/// Returns an error when the number of contrast rows differs from the number of
/// design columns, when the contrast has rank zero, or when the contrast is not
/// of full column rank.
pub fn contrast_as_coef(
    design: &Array2<f64>,
    contrast: &Array2<f64>,
    first: bool,
) -> Result<ContrastAsCoef> {
    let n = design.nrows();
    let p = design.ncols();
    if contrast.nrows() != p {
        bail!(
            "contrast_as_coef: contrast rows ({}) must match design cols ({})",
            contrast.nrows(),
            p
        );
    }
    let nc = contrast.ncols();
    let rank = matrix_rank(contrast);
    if rank == 0 {
        bail!("contrast_as_coef: contrast is all zero");
    }
    if rank != nc {
        bail!(
            "contrast_as_coef: only full-column-rank contrasts are supported (rank {} of {} columns)",
            rank,
            nc
        );
    }
    // Past the rank check, the number of contrast coefficients equals the
    // number of contrast columns.
    let k = nc;

    // designT = Q' t(design) using the full orthogonal factor of the contrast.
    let qfull = qr_full_q(contrast); // p x p
    let (_, rmat) = qr_econ(contrast); // k x k upper triangular
    let mut designt = qfull.t().dot(&design.t()); // p x n

    // Replace the leading k rows with R^-1 designT[0..k] (back-substitution),
    // turning the contrast directions into plain coefficients.
    for col in 0..n {
        // Solve from the last row upwards: rows j > i of this column already
        // hold their solved values when row i is processed.
        for i in (0..k).rev() {
            let mut s = designt[[i, col]];
            for j in (i + 1)..k {
                s -= rmat[[i, j]] * designt[[j, col]];
            }
            designt[[i, col]] = s / rmat[[i, i]];
        }
    }
    let reformed = designt.t().to_owned(); // n x p, columns 0..k the contrasts

    // Place contrast coefficients first or last, as requested.
    if first {
        Ok(ContrastAsCoef {
            design: reformed,
            coef: (0..k).collect(),
            rank,
        })
    } else {
        // Rotate the columns: completion columns k..p come first, then the k
        // contrast columns.
        let mut out = Array2::<f64>::zeros((n, p));
        for (newj, oldj) in (k..p).chain(0..k).enumerate() {
            out.column_mut(newj).assign(&reformed.column(oldj));
        }
        Ok(ContrastAsCoef {
            design: out,
            coef: (p - k..p).collect(),
            rank,
        })
    }
}

/// Competitive gene-set test from an expression matrix and design
/// (`camera`/`camera.default`) with a fixed inter-gene correlation.
///
/// `exprs` is `G x n` (genes by samples), `design` is `n x p`, and `coef` is the
/// 0-based design column whose contrast is tested (limma's `contrast`, default
/// the last column). Moderated-t statistics are computed internally from the QR
/// effects and [`squeeze_var`] (`trend.var = FALSE`, `robust = FALSE`), then
/// converted to z-scores with Hill's approximation (`use_ranks = false`) or used
/// directly (`use_ranks = true`) before the per-set machinery of [`camera_pr`].
///
/// `inter_gene_cor` is clamped at 0 (limma's `allow.neg.cor = FALSE` default), a
/// no-op for the usual small positive correlation.
pub fn camera(
    exprs: &Array2<f64>,
    design: &Array2<f64>,
    coef: usize,
    index: &[Vec<usize>],
    inter_gene_cor: f64,
    use_ranks: bool,
    sort: bool,
) -> Result<Vec<CameraResult>> {
    let g = exprs.nrows();
    let n = exprs.ncols();
    let p = design.ncols();
    assert!(g >= 3, "camera: need at least 3 genes");
    let df_residual = n as f64 - p as f64;
    assert!(df_residual >= 1.0, "camera: no residual df");

    // Reorder so the tested contrast is the final column, then take QR effects.
    let design = move_coef_last(design, coef);
    let qfull = qr_full_q(&design);
    let (_, r) = qr_econ(&design);
    let effects = qfull.t().dot(&exprs.t()); // n x G = Q' t(y)

    // Unscaled t = the p-th effect, signed by the R pivot. The product of the
    // effect and sign(R[p,p]) is invariant to the QR sign convention, so it
    // matches limma even though the Householder signs may differ from LAPACK.
    let sign = if r[[p - 1, p - 1]] < 0.0 { -1.0 } else { 1.0 };
    let unscaledt: Vec<f64> = (0..g).map(|gi| effects[[p - 1, gi]] * sign).collect();

    // Residual variance per gene = mean square of the trailing effects.
    let mut sigma2 = Array1::<f64>::zeros(g);
    for (gi, s) in sigma2.iter_mut().enumerate() {
        let mut acc = 0.0;
        for k in p..n {
            let e = effects[[k, gi]];
            acc += e * e;
        }
        *s = acc / df_residual;
    }

    let sv = squeeze_var(&sigma2, &Array1::from_elem(g, df_residual), None, false)?;

    let mut stat = vec![0.0; g];
    if use_ranks {
        for gi in 0..g {
            stat[gi] = unscaledt[gi] / sv.var_post[gi].sqrt();
        }
    } else {
        let df_total = (df_residual + sv.df_prior[0]).min(g as f64 * df_residual);
        for gi in 0..g {
            let modt = unscaledt[gi] / sv.var_post[gi].sqrt();
            stat[gi] = zscore_t_hill(modt, df_total);
        }
    }

    let cor = inter_gene_cor.max(0.0);
    Ok(camera_pr(&stat, index, cor, use_ranks, sort))
}

/// Genewise effects matrix with `n - p + 1` columns (`.lmEffects`, without
/// weights or blocks).
///
/// `exprs` is `G x n`, `design` is `n x p` and `coef` is the 0-based contrast
/// column. In the returned `G x (n - p + 1)` matrix, column 0 holds the contrast
/// effect multiplied by the sign of the last diagonal entry of `R`, and the
/// remaining columns hold the residual effects unchanged.
fn lm_effects(exprs: &Array2<f64>, design: &Array2<f64>, coef: usize) -> Array2<f64> {
    let n_genes = exprs.nrows();
    let n_samples = exprs.ncols();
    let p = design.ncols();

    let reordered = move_coef_last(design, coef);
    let q = qr_full_q(&reordered);
    let (_, r) = qr_econ(&reordered);
    let rotated = q.t().dot(&exprs.t()); // n x G = Q' t(y)

    // Row of `rotated` that holds the contrast effect once `coef` is last.
    let last = p - 1;
    let flip = if r[[last, last]] < 0.0 { -1.0 } else { 1.0 };
    let n_effects = n_samples - p + 1;

    Array2::from_shape_fn((n_genes, n_effects), |(gi, k)| {
        if k == 0 {
            rotated[[last, gi]] * flip
        } else {
            rotated[[last + k, gi]]
        }
    })
}

/// Sort order for [`fry`] (`sort` argument). Every key is ascending except set
/// size, which is descending.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum FrySort {
    /// By directional p-value, then descending set size, then mixed p-value.
    Directional,
    /// By mixed p-value, then descending set size, then directional p-value.
    Mixed,
    /// Leave sets in input order.
    NoSort,
}

/// One gene set's result from [`fry`].
#[derive(Clone, Debug)]
pub struct FryResult {
    /// 0-based position of this set in the input `index` list. This still
    /// identifies the set after the rows have been sorted.
    pub set: usize,
    /// Number of genes in the set.
    pub n_genes: usize,
    /// Net direction of enrichment: `Down` when the set's t-statistic is
    /// negative, otherwise `Up`.
    pub direction: Direction,
    /// Directional (two-sided) p-value.
    pub p_value: f64,
    /// Benjamini–Hochberg FDR for `p_value` (equals `p_value` for a single set).
    pub fdr: f64,
    /// Mixed (non-directional) p-value; equal to `p_value` for a set that does
    /// not have more than one gene.
    pub p_value_mixed: f64,
    /// Benjamini–Hochberg FDR for `p_value_mixed` (equals it for a single set).
    pub fdr_mixed: f64,
}

/// Mixed (non-directional) p-value for one set via the `nrot = Inf` Beta
/// approximation of `.fryEffects` (`m > 1`). `eff` is the standardized effects
/// matrix; `iset` the 0-based set members.
///
/// The observed statistic is the set's sum of squared column-0 effects,
/// rescaled to `[0, 1]` by the smallest and largest squared singular values of
/// the set's effects block. Its null distribution is approximated by a Beta
/// distribution matched on mean and variance, and the upper tail is returned.
///
/// # Panics
/// Panics if `Beta::new` rejects the fitted shape parameters (the constructor
/// result is unwrapped), or if a member of `iset` is out of range for `eff`.
fn fry_mixed_pvalue(eff: &Array2<f64>, iset: &[usize]) -> f64 {
    let neff = eff.ncols();
    let m = iset.len();

    // Squared singular values of the m x neff set block = eigenvalues of the
    // smaller Gram matrix, descending.
    let mut a: Vec<f64> = if neff <= m {
        // neff x neff Gram matrix (block' * block), accumulated gene by gene.
        let mut gram = Array2::<f64>::zeros((neff, neff));
        for &gi in iset {
            for i in 0..neff {
                for j in 0..neff {
                    gram[[i, j]] += eff[[gi, i]] * eff[[gi, j]];
                }
            }
        }
        eigh(&gram).0.to_vec()
    } else {
        // m x m Gram matrix (block * block'), one inner product per gene pair.
        let mut gram = Array2::<f64>::zeros((m, m));
        for (ai, &gi) in iset.iter().enumerate() {
            for (bi, &gj) in iset.iter().enumerate() {
                let mut s = 0.0;
                for k in 0..neff {
                    s += eff[[gi, k]] * eff[[gj, k]];
                }
                gram[[ai, bi]] = s;
            }
        }
        eigh(&gram).0.to_vec()
    };
    // NOTE(review): assumes `eigh` returns eigenvalues in ascending order, so
    // reversing yields the descending order used below — confirm in `linalg`.
    a.reverse(); // descending

    // Moments of one coordinate of the Beta-distributed weight vector.
    let d1 = a.len();
    let d1f = d1 as f64;
    let d = d1f - 1.0;
    let beta_mean = 1.0 / d1f;
    let beta_var = d / d1f / d1f / (d1f / 2.0 + 1.0);

    // Observed statistic, rescaled by the extreme squared singular values.
    let a1 = a[0];
    let ad1 = a[d1 - 1];
    let span = a1 - ad1;
    let sum_col1_sq: f64 = iset.iter().map(|&gi| eff[[gi, 0]] * eff[[gi, 0]]).sum();
    let fobs = (sum_col1_sq - ad1) / span;

    // Mean and variance of the rescaled statistic under the approximation.
    let suma: f64 = a.iter().sum();
    let suma2: f64 = a.iter().map(|&v| v * v).sum();
    let frb_mean = (suma * beta_mean - ad1) / span;
    // A' COV A with COV = beta_var I - (beta_var/d)(J - I).
    let quad = beta_var * suma2 - (beta_var / d) * (suma * suma - suma2);
    let frb_var = quad / (span * span);

    // Method-of-moments Beta fit, then the upper-tail probability at `fobs`.
    let alphaplusbeta = frb_mean * (1.0 - frb_mean) / frb_var - 1.0;
    let alpha = alphaplusbeta * frb_mean;
    let beta = alphaplusbeta - alpha;
    let dist = Beta::new(alpha, beta).unwrap();
    1.0 - dist.cdf(fobs)
}

/// Fast approximation to `roast` (`fry`): the `nrot = Inf`, `prior.df = Inf`
/// limit, giving deterministic directional and mixed p-values.
///
/// `exprs` is `G x n` (genes by samples), `design` is `n x p`, and `coef` is the
/// 0-based contrast column (limma's `contrast`, default the last). `index` lists
/// the gene sets as 0-based row positions and `sort` selects the row order of
/// the result (see [`FrySort`]).
///
/// Uses the default `standardize = "posterior.sd"`: robust genewise variances
/// squeezed toward an [`fit_fdist`] prior estimated from the residual variances.
///
/// Returns one [`FryResult`] per set. FDRs are Benjamini–Hochberg adjustments
/// across sets, or the raw p-values when there is exactly one set.
///
/// # Panics
/// Panics if sorting meets a NaN p-value (the comparators unwrap
/// `partial_cmp`), or if a set member is out of range for `exprs`.
pub fn fry(
    exprs: &Array2<f64>,
    design: &Array2<f64>,
    coef: usize,
    index: &[Vec<usize>],
    sort: FrySort,
) -> Result<Vec<FryResult>> {
    let mut eff = lm_effects(exprs, design, coef);
    let g = eff.nrows();
    let neff = eff.ncols();
    // Column 0 is the contrast effect; the remaining columns are residuals.
    let df_residual = (neff - 1) as f64;

    // Expected maximum squared effect under the null, by Gauss–Legendre
    // quadrature; `qchisq(x, df=1) = qnorm((x+1)/2)^2`.
    let (nodes, weights) = gauss_legendre_01(128);
    let normal = Normal::new(0.0, 1.0).unwrap();
    let mut eu2max = 0.0;
    for (&x, &w) in nodes.iter().zip(weights.iter()) {
        let q = normal.inverse_cdf((x + 1.0) / 2.0);
        eu2max += (df_residual + 1.0) * x.powf(df_residual) * (q * q) * w;
    }

    // Robust variance (drop the largest squared effect) and residual variance.
    let mut s2_robust = Array1::<f64>::zeros(g);
    let mut s2 = Array1::<f64>::zeros(g);
    for gi in 0..g {
        let mut sumsq = 0.0;
        let mut maxsq = f64::NEG_INFINITY;
        let mut sumsq_resid = 0.0;
        for k in 0..neff {
            let e2 = eff[[gi, k]] * eff[[gi, k]];
            sumsq += e2;
            if e2 > maxsq {
                maxsq = e2;
            }
            // Only the residual columns (k >= 1) feed the ordinary variance.
            if k >= 1 {
                sumsq_resid += e2;
            }
        }
        s2_robust[gi] = (sumsq - maxsq) / (df_residual + 1.0 - eu2max);
        s2[gi] = sumsq_resid / df_residual;
    }

    // Empirical-Bayes squeeze: prior from residual variances, applied to robust.
    let (scale, df2) = fit_fdist(&s2, &Array1::from_elem(g, df_residual));
    let s2_robust = squeeze_var_post(
        &s2_robust,
        &Array1::from_elem(g, 0.92 * df_residual),
        &Array1::from_elem(g, scale),
        &Array1::from_elem(g, df2),
    );
    // Standardize every gene's effects by its squeezed robust standard deviation.
    for gi in 0..g {
        let s = s2_robust[gi].sqrt();
        for k in 0..neff {
            eff[[gi, k]] /= s;
        }
    }

    // Per-set directional and mixed statistics.
    let mut rows: Vec<FryResult> = Vec::with_capacity(index.len());
    for (si, iset) in index.iter().enumerate() {
        let m = iset.len();
        // Mean standardized effect of the set in each effect column.
        let mut colmean = vec![0.0; neff];
        for &gi in iset {
            for (k, cm) in colmean.iter_mut().enumerate() {
                *cm += eff[[gi, k]];
            }
        }
        for cm in colmean.iter_mut() {
            *cm /= m as f64;
        }
        // t = mean contrast effect over the root mean square of the mean
        // residual effects.
        let mean_resid_sq = colmean[1..].iter().map(|&v| v * v).sum::<f64>() / (neff - 1) as f64;
        let t_stat = colmean[0] / mean_resid_sq.sqrt();
        let direction = if t_stat < 0.0 {
            Direction::Down
        } else {
            Direction::Up
        };
        let p_value = 2.0 * pt_lower(-t_stat.abs(), df_residual);
        // The Beta approximation is only used for sets with more than one
        // gene; otherwise the mixed p-value is the directional one.
        let p_value_mixed = if m > 1 {
            fry_mixed_pvalue(&eff, iset)
        } else {
            p_value
        };
        rows.push(FryResult {
            set: si,
            n_genes: m,
            direction,
            p_value,
            fdr: f64::NAN,
            p_value_mixed,
            fdr_mixed: f64::NAN,
        });
    }

    // BH adjustment across sets; a single set keeps its raw p-values.
    if rows.len() > 1 {
        let p: Vec<f64> = rows.iter().map(|r| r.p_value).collect();
        let pm: Vec<f64> = rows.iter().map(|r| r.p_value_mixed).collect();
        let fdr = crate::toptable::p_adjust_bh(&p);
        let fdr_mixed = crate::toptable::p_adjust_bh(&pm);
        for (r, (f, fm)) in rows.iter_mut().zip(fdr.into_iter().zip(fdr_mixed)) {
            r.fdr = f;
            r.fdr_mixed = fm;
        }
    } else if let Some(r) = rows.first_mut() {
        r.fdr = r.p_value;
        r.fdr_mixed = r.p_value_mixed;
    }

    // Stable sort: primary p-value ascending, then larger sets first, then the
    // other p-value ascending.
    match sort {
        FrySort::Directional => rows.sort_by(|a, b| {
            a.p_value
                .partial_cmp(&b.p_value)
                .unwrap()
                .then(b.n_genes.cmp(&a.n_genes))
                .then(a.p_value_mixed.partial_cmp(&b.p_value_mixed).unwrap())
        }),
        FrySort::Mixed => rows.sort_by(|a, b| {
            a.p_value_mixed
                .partial_cmp(&b.p_value_mixed)
                .unwrap()
                .then(b.n_genes.cmp(&a.n_genes))
                .then(a.p_value.partial_cmp(&b.p_value).unwrap())
        }),
        FrySort::NoSort => {}
    }
    Ok(rows)
}

/// Result of a single-set rotation gene-set test ([`roast`]).
///
/// limma reports a four-row data frame with rows `Down`, `Up`, `UpOrDown`,
/// `Mixed`; the arrays here follow that row order, so index 0 is `Down` and
/// index 3 is `Mixed` in both `active_prop` and `p_value`.
#[derive(Clone, Debug)]
pub struct Roast {
    /// Active proportions `[Down, Up, UpOrDown, Mixed]`, i.e.
    /// `[a2, a1, max(a1, a2), a1 + a2]` where `a1`/`a2` are the fractions of the
    /// set with moderated z above `+sqrt(2)` / below `-sqrt(2)`.
    pub active_prop: [f64; 4],
    /// Rotation p-values `[Down, Up, UpOrDown, Mixed]`.
    pub p_value: [f64; 4],
    /// Number of genes in the tested set.
    pub n_genes_in_set: usize,
}

/// Rotation gene-set test for a single set (`roast`).
///
/// Implements limma's default configuration: `set.statistic = "mean"`,
/// `approx.zscore = TRUE`, `legacy = FALSE`, with no gene weights, array
/// weights or blocking.
///
/// * `exprs` — `G x n` expression matrix (genes by samples).
/// * `design` — `n x p` design matrix.
/// * `coef` — 0-based contrast column.
/// * `index` — 0-based members of the set.
/// * `nrot` — number of rotations (limma's default is `1999`).
/// * `rng` — generator already seeded by the caller, equivalent to calling R's
///   `set.seed` immediately before `roast`.
///
/// The rotations are the test's only source of randomness; they draw from `rng`
/// exactly as `.roastEffects` does (`rnorm(nroti * neffects)` per chunk of
/// `1000`, filled column-major), so a bit-exact [`RRng`] reproduces limma's
/// Monte-Carlo counts.
///
/// # Errors
/// Propagates any error from the shared preprocessing step.
pub fn roast(
    exprs: &Array2<f64>,
    design: &Array2<f64>,
    coef: usize,
    index: &[usize],
    nrot: usize,
    rng: &mut RRng,
) -> Result<Roast> {
    // Genome-wide quantities first, then narrow them to the requested set.
    let prepared = roast_prepare(exprs, design, coef)?;
    let (all_effects, var_prior, df_prior, all_var_post) = prepared;
    let (set_effects, set_var_post) = subset_effects(&all_effects, &all_var_post, index);

    let result = roast_effects(
        &set_effects,
        var_prior,
        df_prior,
        &set_var_post,
        nrot,
        rng,
    );
    Ok(result)
}

/// Preprocessing shared by [`roast`] and [`mroast`], computed once over all
/// genes so it can be reused for every set (as limma does).
///
/// Returns, in order:
/// 1. the gene-wise effects matrix (`G x neffects`, column 0 the primary effect),
/// 2. the squeezed prior variance,
/// 3. the squeezed prior degrees of freedom,
/// 4. the per-gene posterior variances.
///
/// # Errors
/// Propagates any error from [`squeeze_var`].
fn roast_prepare(
    exprs: &Array2<f64>,
    design: &Array2<f64>,
    coef: usize,
) -> Result<(Array2<f64>, f64, f64, Array1<f64>)> {
    let effects = lm_effects(exprs, design, coef);
    let n_genes = effects.nrows();
    let n_resid = effects.ncols() - 1;
    let df_residual = n_resid as f64;

    // Residual mean square per gene, taken over effect columns 1..=n_resid.
    let mut resid_ms = Array1::<f64>::zeros(n_genes);
    for (gi, slot) in resid_ms.iter_mut().enumerate() {
        let sum_sq = (1..=n_resid).fold(0.0_f64, |acc, k| {
            acc + effects[[gi, k]] * effects[[gi, k]]
        });
        *slot = sum_sq / df_residual;
    }

    let squeezed = squeeze_var(
        &resid_ms,
        &Array1::from_elem(n_genes, df_residual),
        None,
        false,
    )?;
    let var_prior = squeezed.var_prior[0];
    let df_prior = squeezed.df_prior[0];
    Ok((effects, var_prior, df_prior, squeezed.var_post))
}

/// Extracts one gene set from the genome-wide effects matrix and posterior
/// variances.
///
/// `index` holds the 0-based rows to keep. Row `i` of the returned matrix and
/// entry `i` of the returned vector both correspond to `index[i]`, so the set's
/// member order is preserved.
fn subset_effects(
    eff: &Array2<f64>,
    var_post_all: &Array1<f64>,
    index: &[usize],
) -> (Array2<f64>, Vec<f64>) {
    let mut block = Array2::<f64>::zeros((index.len(), eff.ncols()));
    for (row, &gene) in index.iter().enumerate() {
        block.row_mut(row).assign(&eff.row(gene));
    }
    let variances: Vec<f64> = index.iter().map(|&gene| var_post_all[gene]).collect();
    (block, variances)
}

/// Rotation core (`.roastEffects`) for the default `set.statistic = "mean"`,
/// `approx.zscore = TRUE`, `legacy = FALSE`, no-gene-weights path. `effects` is
/// the `nset x neffects` block for one set (column 0 is the primary effect);
/// `var_prior` / `df_prior` are the scalar squeezed prior and `var_post` the
/// per-gene posterior variances.
///
/// Returns p-values and active proportions ordered `[Down, Up, UpOrDown, Mixed]`.
/// The one-sided counts tally both the rotated statistic and its negation, hence
/// their `2 * nrot + 1` denominators; UpOrDown and Mixed use `nrot + 1`.
///
/// Edge cases:
/// * `df_prior` infinite: the rotated posterior variance is `var_prior` itself
///   (the same rule [`romer`] applies), instead of the `inf / inf = NaN` the
///   weighted-average formula would produce.
/// * `nrot == 0`: no draws are taken from `rng` and every p-value is `1`.
/// * An empty `effects` block is not rejected here: the set statistics are
///   `0 / 0 = NaN`, no rotation can compare greater than NaN, and the p-values
///   degenerate to `1 / denominator`. Callers should filter empty sets first.
fn roast_effects(
    effects: &Array2<f64>,
    var_prior: f64,
    df_prior: f64,
    var_post: &[f64],
    nrot: usize,
    rng: &mut RRng,
) -> Roast {
    let nset = effects.nrows();
    let neff = effects.ncols();
    let df_residual = (neff - 1) as f64;
    let df_total = df_prior + df_residual;
    let df_total_winsor = df_total.min(10000.0);
    let prior_term = df_prior * var_prior;
    let nset_f = nset as f64;
    let sqrt2 = std::f64::consts::SQRT_2;

    // Observed moderated z-statistics, active proportions and set statistics.
    let mut sum_modt = 0.0;
    let mut sum_abs_modt = 0.0;
    let mut n_up = 0usize;
    let mut n_down = 0usize;
    for gi in 0..nset {
        let modt = zscore_t(
            effects[[gi, 0]] / var_post[gi].sqrt(),
            df_total_winsor,
            ZscoreTMethod::Bailey,
        );
        sum_modt += modt;
        sum_abs_modt += modt.abs();
        if modt > sqrt2 {
            n_up += 1;
        }
        if modt < -sqrt2 {
            n_down += 1;
        }
    }
    let a1 = n_up as f64 / nset_f;
    let a2 = n_down as f64 / nset_f;
    let m = sum_modt / nset_f;
    let statobs_down = -m;
    let statobs_up = m;
    let statobs_mixed = sum_abs_modt / nset_f;

    // Per-gene sum of squared effects (the rotation-invariant total).
    let mut rowsq = vec![0.0; nset];
    for gi in 0..nset {
        let mut acc = 0.0;
        for k in 0..neff {
            acc += effects[[gi, k]] * effects[[gi, k]];
        }
        rowsq[gi] = acc;
    }

    // Rotations are conducted in chunks; the chunk sizes fix the per-chunk RNG
    // draw counts, which must match limma exactly. The draw itself (`rng.rnorm`)
    // stays serial to preserve limma's exact Mersenne-Twister draw order — the
    // source of the bit-exact p-values — but the per-rotation work that *consumes*
    // those draws is independent across rotations and feeds only integer counters,
    // whose sum is order-independent. So that inner loop is parallelised across
    // rotations (behind the `parallel` feature) with a result that is bit-identical
    // to the serial path, and to limma.
    let chunk = 1000usize;
    let nchunk = nrot.div_ceil(chunk);
    // `nrot == 0` means zero chunks; skip the per-chunk size so it never divides
    // by zero. The loop below then simply does not run.
    let nroti0 = if nchunk == 0 {
        0
    } else {
        nrot.div_ceil(nchunk)
    };
    let overshoot = nchunk * nroti0 - nrot;
    let finite_prior = df_prior.is_finite();

    let mut count = [0i64; 4];
    for chunki in 0..nchunk {
        let nroti = if chunki == nchunk - 1 {
            nroti0 - overshoot
        } else {
            nroti0
        };
        // rnorm(nroti * neffects), interpreted column-major as nroti x neffects.
        let draws = rng.rnorm(nroti * neff);
        let ctx = RotationCtx {
            draws: &draws,
            nroti,
            neff,
            nset,
            effects,
            rowsq: &rowsq,
            prior_term,
            var_prior,
            finite_prior,
            df_residual,
            df_total,
            df_total_winsor,
            nset_f,
            statobs_down,
            statobs_up,
            statobs_mixed,
        };
        // [down, up, mixed] counts summed over this chunk's rotations.
        let part = ctx.count_rotations();
        count[0] += part[0];
        count[1] += part[1];
        count[3] += part[2];
    }
    // For "mean", UpOrDown is the more significant of the one-sided counts.
    count[2] = count[0].min(count[1]);

    let nrot_i = nrot as i64;
    let denom = [2 * nrot_i + 1, 2 * nrot_i + 1, nrot_i + 1, nrot_i + 1];
    let mut p_value = [0.0; 4];
    for i in 0..4 {
        p_value[i] = (count[i] as f64 + 1.0) / denom[i] as f64;
    }

    Roast {
        active_prop: [a2, a1, a1.max(a2), a1 + a2],
        p_value,
        n_genes_in_set: nset,
    }
}

/// Borrowed, fully-immutable context for one chunk's worth of [`roast_effects`]
/// rotations. Every field a single rotation reads is here, so the per-rotation
/// statistic is a pure function of the rotation index `r`: rotations within a
/// chunk are independent and accumulate only into integer counters. That makes
/// the chunk's total order-independent, so it can be summed with a parallel (or
/// serial) reduction that is bit-identical either way — and identical to limma.
struct RotationCtx<'a> {
    /// `rnorm(nroti * neff)` for this chunk, column-major as `nroti x neff`.
    draws: &'a [f64],
    /// Number of rotations in this chunk.
    nroti: usize,
    /// Number of effect columns (primary effect plus residual effects).
    neff: usize,
    /// Number of genes in the set.
    nset: usize,
    /// `nset x neff` effects block for the set (column 0 the primary effect).
    effects: &'a Array2<f64>,
    /// Per-gene sum of squared effects (the rotation-invariant total).
    rowsq: &'a [f64],
    /// `df_prior * var_prior`; only meaningful when `finite_prior` is true.
    prior_term: f64,
    /// Prior variance, used directly as the rotated variance when the prior
    /// degrees of freedom are infinite.
    var_prior: f64,
    /// Whether the prior degrees of freedom are finite.
    finite_prior: bool,
    /// Residual degrees of freedom, `neff - 1`.
    df_residual: f64,
    /// `df_prior + df_residual`.
    df_total: f64,
    /// `df_total` capped at `10000`, passed to the z-score approximation.
    df_total_winsor: f64,
    /// `nset` as a float, the divisor of the set means.
    nset_f: f64,
    /// Observed set statistics the rotated ones are compared against.
    statobs_down: f64,
    statobs_up: f64,
    statobs_mixed: f64,
}

impl RotationCtx<'_> {
    /// Counts contributed by a single rotation `r`, as
    /// `[#{rot > statobs_down}, #{rot > statobs_up}, #{mixed > statobs_mixed}]`
    /// (the first two each count both the down and up rotated statistics, exactly
    /// as limma tallies `statrot[,c("down","up")]`). `zrow` is a caller-owned
    /// scratch buffer of length `neff`, reused across rotations to avoid
    /// allocating inside the hot loop.
    #[inline]
    fn count_one(&self, r: usize, zrow: &mut [f64]) -> [i64; 3] {
        // Unit-normalize the rotation row (limma's modtr / sqrt(rowSums^2)).
        let mut znorm = 0.0;
        for (k, z) in zrow.iter_mut().enumerate() {
            let v = self.draws[k * self.nroti + r];
            *z = v;
            znorm += v * v;
        }
        let znorm = znorm.sqrt();
        for z in zrow.iter_mut() {
            *z /= znorm;
        }
        // Rotated, moderated z-statistics for each gene in the set.
        let mut sum_z = 0.0;
        let mut sum_abs_z = 0.0;
        for gi in 0..self.nset {
            // zrow.len() == neff, so this is k = 0..neff in order (bit-identical
            // accumulation), just without the redundant bounds check clippy flags.
            let mut t = 0.0;
            for (k, &zv) in zrow.iter().enumerate() {
                t += self.effects[[gi, k]] * zv;
            }
            let s2r0 = (self.rowsq[gi] - t * t) / self.df_residual;
            // With an infinite prior df the weighted average is inf / inf, so the
            // posterior variance is taken to be the prior variance itself.
            let s2r = if self.finite_prior {
                (self.prior_term + self.df_residual * s2r0) / self.df_total
            } else {
                self.var_prior
            };
            let z = zscore_t(t / s2r.sqrt(), self.df_total_winsor, ZscoreTMethod::Bailey);
            sum_z += z;
            sum_abs_z += z.abs();
        }
        let up_r = sum_z / self.nset_f;
        let down_r = -up_r;
        let mixed_r = sum_abs_z / self.nset_f;
        [
            (down_r > self.statobs_down) as i64 + (up_r > self.statobs_down) as i64,
            (down_r > self.statobs_up) as i64 + (up_r > self.statobs_up) as i64,
            (mixed_r > self.statobs_mixed) as i64,
        ]
    }

    /// Sum [`Self::count_one`] over all `nroti` rotations in the chunk. Parallel
    /// across rotations under the `parallel` feature; because the reduction is
    /// over integer counters it is bit-identical to the serial fold (and limma).
    #[cfg(feature = "parallel")]
    fn count_rotations(&self) -> [i64; 3] {
        use rayon::prelude::*;
        (0..self.nroti)
            .into_par_iter()
            .fold(
                || ([0i64; 3], vec![0.0; self.neff]),
                |(mut acc, mut zrow), r| {
                    let c = self.count_one(r, &mut zrow);
                    acc[0] += c[0];
                    acc[1] += c[1];
                    acc[2] += c[2];
                    (acc, zrow)
                },
            )
            .map(|(acc, _)| acc)
            .reduce(|| [0i64; 3], |a, b| [a[0] + b[0], a[1] + b[1], a[2] + b[2]])
    }

    /// Serial fallback (`--no-default-features`): a single reused scratch buffer.
    #[cfg(not(feature = "parallel"))]
    fn count_rotations(&self) -> [i64; 3] {
        let mut acc = [0i64; 3];
        let mut zrow = vec![0.0; self.neff];
        for r in 0..self.nroti {
            let c = self.count_one(r, &mut zrow);
            acc[0] += c[0];
            acc[1] += c[1];
            acc[2] += c[2];
        }
        acc
    }
}

/// One row of the [`mroast`] result table (limma's `mroast` data frame): the
/// rotation-test summary for a single gene set.
#[derive(Clone, Debug)]
pub struct MroastRow {
    /// 0-based position of this set in the input `index` list, recorded before
    /// any sorting so the caller can recover the original order.
    pub set: usize,
    /// Number of genes in the set (`NGenes`).
    pub n_genes: usize,
    /// Active proportion in the down direction (`PropDown`).
    pub prop_down: f64,
    /// Active proportion in the up direction (`PropUp`).
    pub prop_up: f64,
    /// Net direction (`Direction`): [`Direction::Up`] when the up p-value is
    /// strictly smaller than the down p-value, otherwise [`Direction::Down`]
    /// (so ties resolve to Down).
    pub direction: Direction,
    /// Two-sided (UpOrDown) rotation p-value (`PValue`). Always the raw rotation
    /// p-value; the mid-p shift only affects the FDR columns.
    pub p_value: f64,
    /// Benjamini-Hochberg FDR across sets over the two-sided p-values (`FDR`).
    /// With `midp` it is computed from the shifted p-values and then floored at
    /// [`MroastRow::p_value`].
    pub fdr: f64,
    /// Mixed, non-directional rotation p-value (`PValue.Mixed`), also unshifted.
    pub p_value_mixed: f64,
    /// Benjamini-Hochberg FDR across sets over the mixed p-values (`FDR.Mixed`).
    /// With `midp` it is floored at [`MroastRow::p_value_mixed`].
    pub fdr_mixed: f64,
}

/// Multi-set rotation gene-set test (`mroast`).
///
/// Runs the [`roast`] rotation test for every set in `index` (each a slice of
/// 0-based gene indices), sharing the effects matrix and empirical-Bayes prior
/// across sets, then assembles limma's `mroast` table with Benjamini-Hochberg
/// FDRs computed across the sets. The default `set.statistic = "mean"`,
/// `approx.zscore = TRUE`, `legacy = FALSE`, no-gene-weights path is ported.
///
/// `midp` toggles limma's default mid-p correction (`midp = TRUE`): the FDRs are
/// computed from p-values shifted down by `1/2/(nrot+1)` and then floored back
/// at the raw rotation p-value. `sort` orders the rows ([`FrySort::Directional`]
/// is limma's default; [`FrySort::Mixed`] / [`FrySort::NoSort`] match
/// `sort = "mixed"` / `"none"`). Each [`MroastRow::set`] records the row's
/// original 0-based position in `index`.
///
/// `rng` is supplied already seeded by the caller. The sets are processed in
/// input order through a single shared `rng`, exactly as limma reuses the
/// rotation stream across sets, so a bit-exact [`RRng`] reproduces limma's
/// Monte-Carlo counts.
#[allow(clippy::too_many_arguments)]
pub fn mroast(
    exprs: &Array2<f64>,
    design: &Array2<f64>,
    coef: usize,
    index: &[Vec<usize>],
    nrot: usize,
    midp: bool,
    sort: FrySort,
    rng: &mut RRng,
) -> Result<Vec<MroastRow>> {
    let (eff, var_prior, df_prior, var_post_all) = roast_prepare(exprs, design, coef)?;

    let mut rows = Vec::with_capacity(index.len());
    for (si, set) in index.iter().enumerate() {
        let (set_eff, var_post) = subset_effects(&eff, &var_post_all, set);
        let r = roast_effects(&set_eff, var_prior, df_prior, &var_post, nrot, rng);
        // Direction follows the smaller one-sided p-value (ties resolve to Down,
        // matching R's `pv[,"Up"] < pv[,"Down"]`).
        let direction = if r.p_value[1] < r.p_value[0] {
            Direction::Up
        } else {
            Direction::Down
        };
        rows.push(MroastRow {
            set: si,
            n_genes: r.n_genes_in_set,
            prop_down: r.active_prop[0],
            prop_up: r.active_prop[1],
            direction,
            p_value: r.p_value[2],
            fdr: f64::NAN,
            p_value_mixed: r.p_value[3],
            fdr_mixed: f64::NAN,
        });
    }

    // Mid-p shift, then Benjamini-Hochberg across sets, then (for mid-p) floor
    // each FDR back at its raw rotation p-value.
    let midp_adj = if midp { 0.5 / (nrot as f64 + 1.0) } else { 0.0 };
    let two_sided: Vec<f64> = rows.iter().map(|r| r.p_value - midp_adj).collect();
    let mixed: Vec<f64> = rows.iter().map(|r| r.p_value_mixed - midp_adj).collect();
    let mut fdr = crate::toptable::p_adjust_bh(&two_sided);
    let mut fdr_mixed = crate::toptable::p_adjust_bh(&mixed);
    if midp {
        for (i, r) in rows.iter().enumerate() {
            fdr[i] = fdr[i].max(r.p_value);
            fdr_mixed[i] = fdr_mixed[i].max(r.p_value_mixed);
        }
    }
    for (r, (f, fm)) in rows.iter_mut().zip(fdr.into_iter().zip(fdr_mixed)) {
        r.fdr = f;
        r.fdr_mixed = fm;
    }

    match sort {
        FrySort::Directional => rows.sort_by(|a, b| {
            a.p_value
                .partial_cmp(&b.p_value)
                .unwrap()
                .then(
                    b.prop_up
                        .max(b.prop_down)
                        .partial_cmp(&a.prop_up.max(a.prop_down))
                        .unwrap(),
                )
                .then(b.n_genes.cmp(&a.n_genes))
                .then(a.p_value_mixed.partial_cmp(&b.p_value_mixed).unwrap())
        }),
        FrySort::Mixed => rows.sort_by(|a, b| {
            a.p_value_mixed
                .partial_cmp(&b.p_value_mixed)
                .unwrap()
                .then(
                    (b.prop_up + b.prop_down)
                        .partial_cmp(&(a.prop_up + a.prop_down))
                        .unwrap(),
                )
                .then(b.n_genes.cmp(&a.n_genes))
                .then(a.p_value.partial_cmp(&b.p_value).unwrap())
        }),
        FrySort::NoSort => {}
    }
    Ok(rows)
}

/// One row of [`romer`] output: the set size and the three rotation p-values.
/// Each p-value is `(hits + 1) / (nrot + 1)`, where `hits` counts the rotations
/// whose set statistic was at least as extreme as the observed one.
#[derive(Clone, Debug)]
pub struct RomerRow {
    /// 0-based position of this set in the input `index` list.
    pub set: usize,
    /// Number of genes in the set (`NGenes`), i.e. the length of the index list.
    pub n_genes: usize,
    /// Up-regulation p-value (`Up`): high mean rank of the moderated t.
    pub p_up: f64,
    /// Down-regulation p-value (`Down`): low mean rank of the moderated t.
    pub p_down: f64,
    /// Mixed p-value (`Mixed`): high mean rank of the absolute moderated t.
    pub p_mixed: f64,
}

/// Alternative hypothesis for [`top_romer`] (`topRomer`'s `alternative`). It
/// selects which [`RomerRow`] p-value is the primary sort key.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RomerAlternative {
    /// `"up"`: most up-regulated sets first (sorts on `p_up`).
    Up,
    /// `"down"`: most down-regulated sets first (sorts on `p_down`).
    Down,
    /// `"mixed"`: most differentially expressed (either direction) first
    /// (sorts on `p_mixed`).
    Mixed,
}

/// Set-level summary statistic for [`romer`] (`romer`'s `set.statistic`). It
/// chooses how the gene ranks of a set are aggregated into the per-set
/// `[Up, Down, Mixed]` statistic.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RomerStatistic {
    /// `"mean"` (default): mean rank of the moderated t over the set.
    Mean,
    /// `"floormean"`: mean rank after flooring the statistic (separate
    /// non-negative ranks for up, down and mixed directions). The up and down
    /// statistics are floored at `0`, the absolute statistic at `1`.
    FloorMean,
    /// `"mean50"`: mean of the more extreme half of the within-set ranks. For
    /// this option the Down statistic is the smaller half, so a rotation counts
    /// for Down when it is at or below the observed value.
    Mean50,
}

/// Rotation mean-rank GSEA for linear models (`romer`).
///
/// Ports all three `set.statistic` options ([`RomerStatistic`]); array weights
/// and blocking are out of scope. `exprs` is `G x n` (genes by samples),
/// `design` is `n x p`, `coef` the 0-based contrast column, and `index` the gene
/// sets as 0-based member indices. `shrink_resid` toggles the empirical-Bayes
/// shrinkage of the contrast effect (`shrink.resid`, limma's default `true`).
///
/// `rng` is supplied already seeded by the caller. The rotation loop is the
/// test's only source of randomness, drawing `rnorm(n - p + 1)` once per
/// rotation exactly as limma does, so a bit-exact [`RRng`] reproduces limma's
/// Monte-Carlo counts. Rows are returned in input order (use [`top_romer`] to
/// rank them).
#[allow(clippy::too_many_arguments)]
pub fn romer(
    exprs: &Array2<f64>,
    design: &Array2<f64>,
    coef: usize,
    index: &[Vec<usize>],
    set_statistic: RomerStatistic,
    nrot: usize,
    shrink_resid: bool,
    rng: &mut RRng,
) -> Result<Vec<RomerRow>> {
    let g = exprs.nrows();
    let n = exprs.ncols();
    let p = design.ncols();
    let d = (n - p) as f64;
    let p0 = p - 1;
    let neff = n - p0; // d + 1: contrast effect plus residual effects.

    // Reorder so the tested contrast is last, then take QR effects (raw, with the
    // pivot sign applied explicitly at the statistic stage as romer does).
    let design = move_coef_last(design, coef);
    let qfull = qr_full_q(&design);
    let (_, rmat) = qr_econ(&design);
    let full = qfull.t().dot(&exprs.t()); // n x G = Q' t(y)
    let signc = if rmat[[p - 1, p - 1]] < 0.0 {
        -1.0
    } else {
        1.0
    };

    // Residual variance per gene and the empirical-Bayes posterior.
    let mut s2 = Array1::<f64>::zeros(g);
    for (gi, s) in s2.iter_mut().enumerate() {
        let mut acc = 0.0;
        for k in p..n {
            let e = full[[k, gi]];
            acc += e * e;
        }
        *s = acc / d;
    }
    let sv = squeeze_var(&s2, &Array1::from_elem(g, d), None, false)?;
    let d0 = sv.df_prior[0];
    let s02 = sv.var_prior[0];

    // Y (gene by effect): column 0 the contrast effect, the rest residuals. yy is
    // the per-gene sum of squares, captured before any shrinkage of column 0.
    let mut ymat = Array2::<f64>::zeros((g, neff));
    let mut yy = vec![0.0; g];
    let mut modt = vec![0.0; g];
    for gi in 0..g {
        let mut acc = 0.0;
        for k in 0..neff {
            let e = full[[p0 + k, gi]];
            ymat[[gi, k]] = e;
            acc += e * e;
        }
        yy[gi] = acc;
        modt[gi] = signc * ymat[[gi, 0]] / sv.var_post[gi].sqrt();
    }

    // Empirical-Bayes shrinkage of the contrast effect toward a residual.
    if shrink_resid {
        let pvals: Vec<f64> = modt
            .iter()
            .map(|&m| 2.0 * pt_upper(m.abs(), d0 + d))
            .collect();
        let proportion = 1.0 - prop_true_null(&pvals, PropTrueNullMethod::Lfdr, 20);
        let stdev_unscaled = 1.0 / rmat[[p - 1, p - 1]].abs();
        let var_unscaled = stdev_unscaled * stdev_unscaled;
        let df_total = d + d0;
        let var_prior_lim = (0.01 / s02, 16.0 / s02);
        let su = vec![stdev_unscaled; g];
        let dt = vec![df_total; g];
        let mut var_prior = tmixture_vector(&modt, &su, &dt, proportion, var_prior_lim);
        if var_prior.is_nan() {
            var_prior = 1.0 / s02;
        }
        let r = (var_unscaled + var_prior) / var_unscaled;
        let logodds = (proportion / (1.0 - proportion)).ln() - r.ln() / 2.0;
        for gi in 0..g {
            let t2 = modt[gi] * modt[gi];
            let kernel = if d0 > 1e6 {
                t2 * (1.0 - 1.0 / r) / 2.0
            } else {
                (1.0 + df_total) / 2.0 * ((t2 + df_total) / (t2 / r + df_total)).ln()
            };
            let lods = logodds + kernel;
            let prob_de = lods.exp() / (1.0 + lods.exp());
            ymat[[gi, 0]] *= (var_unscaled / (var_unscaled + var_prior * prob_de)).sqrt();
        }
    }

    // Observed per-set statistic `[Up, Down, Mixed]` for the chosen aggregation.
    let gf = g as f64;
    let obs = romer_set_stats(&modt, index, gf, set_statistic);

    // For "mean50" the Down statistic is the *small* half of the ranks, so a
    // rotation supports Down when it falls at or below the observed value.
    let down_low = matches!(set_statistic, RomerStatistic::Mean50);

    // Rotations: draw a unit direction in the (d+1)-dim effect space, recompute
    // the moderated t, and tally how often each rotated statistic beats observed.
    //
    // The rotation directions are the test's only randomness and must follow
    // limma's exact Mersenne-Twister order, so draw all `nrot` of them serially
    // up front (row-major, rotation `r` occupies `draws[r*neff..]`). Consuming a
    // drawn row — recomputing modt and tallying per-set hits — is a pure
    // function of that row feeding integer counters, so it parallelizes (under
    // the `parallel` feature) with a reduction that is bit-identical to the
    // serial fold and to limma.
    let mut draws = Vec::with_capacity(nrot * neff);
    for _ in 0..nrot {
        draws.extend_from_slice(&rng.rnorm(neff));
    }
    let ctx = RomerRotCtx {
        draws: &draws,
        neff,
        ymat: &ymat,
        yy: &yy,
        index,
        obs: &obs,
        gf,
        set_statistic,
        d,
        d0,
        s02,
        signc,
        g,
        down_low,
    };
    let nset = index.len();
    let count: Vec<[i64; 3]> = {
        #[cfg(feature = "parallel")]
        {
            use rayon::prelude::*;
            (0..nrot)
                .into_par_iter()
                .fold(
                    || (vec![[0i64; 3]; nset], vec![0.0f64; neff], vec![0.0f64; g]),
                    |(mut acc, mut rvec, mut modtr), r| {
                        ctx.add_rotation(r, &mut rvec, &mut modtr, &mut acc);
                        (acc, rvec, modtr)
                    },
                )
                .map(|(acc, _, _)| acc)
                .reduce(
                    || vec![[0i64; 3]; nset],
                    |mut a, b| {
                        for (x, y) in a.iter_mut().zip(&b) {
                            x[0] += y[0];
                            x[1] += y[1];
                            x[2] += y[2];
                        }
                        a
                    },
                )
        }
        #[cfg(not(feature = "parallel"))]
        {
            let mut acc = vec![[0i64; 3]; nset];
            let mut rvec = vec![0.0f64; neff];
            let mut modtr = vec![0.0f64; g];
            for r in 0..nrot {
                ctx.add_rotation(r, &mut rvec, &mut modtr, &mut acc);
            }
            acc
        }
    };

    let denom = nrot as f64 + 1.0;
    Ok(index
        .iter()
        .enumerate()
        .map(|(si, set)| RomerRow {
            set: si,
            n_genes: set.len(),
            p_up: (count[si][0] as f64 + 1.0) / denom,
            p_down: (count[si][1] as f64 + 1.0) / denom,
            p_mixed: (count[si][2] as f64 + 1.0) / denom,
        })
        .collect())
}

/// Everything a single [`romer`] rotation needs to read, so consuming a rotation
/// is a pure function of its index `r`. Holds only shared borrows and `Copy`
/// scalars, so it is `Sync` and can be shared across rayon workers; the mutable
/// per-rotation scratch (`rvec`, `modtr`) and the accumulator are passed in.
struct RomerRotCtx<'a> {
    /// All rotation directions, row-major: rotation `r` is `draws[r*neff..]`.
    draws: &'a [f64],
    /// Length of one rotation direction (number of columns of `ymat`).
    neff: usize,
    /// Gene-by-effect matrix (column 0 the contrast effect, rest residuals).
    ymat: &'a Array2<f64>,
    /// Per-gene sum of squared effects (rotation-invariant total).
    yy: &'a [f64],
    /// Gene sets as 0-based member indices, in input order.
    index: &'a [Vec<usize>],
    /// Observed per-set `[Up, Down, Mixed]` statistic.
    obs: &'a [[f64; 3]],
    /// Number of genes as a float, passed through to the rank statistics.
    gf: f64,
    /// Aggregation used for both the observed and the rotated statistics.
    set_statistic: RomerStatistic,
    /// Residual degrees of freedom.
    d: f64,
    /// Prior degrees of freedom from `squeeze_var` (may be infinite).
    d0: f64,
    /// Prior variance from `squeeze_var`.
    s02: f64,
    /// Sign (`+1.0` or `-1.0`) applied to the contrast effect.
    signc: f64,
    /// Number of genes; caps how many entries of `modtr` are written.
    g: usize,
    /// When true, a rotation counts toward Down if its statistic is `<=` the
    /// observed one rather than `>=`.
    down_low: bool,
}

impl RomerRotCtx<'_> {
    /// Apply rotation `r`: rebuild the per-gene moderated t under that rotation
    /// and bump the per-set hit counters in `acc` (`[Up, Down, Mixed]` per set).
    ///
    /// `rvec` (length `neff`) and `modtr` (length `g`) are scratch buffers owned
    /// by the caller and overwritten here. The result depends only on `r` and
    /// the shared context, and the counters are integers, so the totals do not
    /// depend on how rotations are distributed over threads.
    fn add_rotation(&self, r: usize, rvec: &mut [f64], modtr: &mut [f64], acc: &mut [[i64; 3]]) {
        // Copy this rotation's draws into the scratch row and scale to unit length.
        let start = r * self.neff;
        let direction = &self.draws[start..start + self.neff];
        let mut sum_sq = 0.0;
        for (k, &draw) in direction.iter().enumerate() {
            rvec[k] = draw;
            sum_sq += draw * draw;
        }
        let norm = sum_sq.sqrt();
        rvec.iter_mut().for_each(|c| *c /= norm);

        // Rotated moderated t per gene: projection onto the direction over the
        // posterior standard deviation of what is left.
        for (gi, slot) in modtr.iter_mut().enumerate().take(self.g) {
            let proj = rvec
                .iter()
                .enumerate()
                .take(self.neff)
                .fold(0.0, |sum, (k, &c)| sum + c * self.ymat[[gi, k]]);
            let resid_var = (self.yy[gi] - proj * proj) / self.d;
            let post_sd = if self.d0.is_finite() {
                let pooled = (self.d0 * self.s02 + self.d * resid_var) / (self.d0 + self.d);
                pooled.sqrt()
            } else {
                // Infinite prior df: the posterior variance is the prior variance.
                self.s02.sqrt()
            };
            *slot = self.signc * proj / post_sd;
        }

        // Compare each set's rotated statistic with its observed counterpart.
        let rotated = romer_set_stats(modtr, self.index, self.gf, self.set_statistic);
        for ((hits, observed), rot) in acc.iter_mut().zip(self.obs).zip(&rotated) {
            let down_hit = if self.down_low {
                rot[1] <= observed[1]
            } else {
                rot[1] >= observed[1]
            };
            hits[0] += i64::from(rot[0] >= observed[0]);
            hits[1] += i64::from(down_hit);
            hits[2] += i64::from(rot[2] >= observed[2]);
        }
    }
}

/// Mean-rank set statistics. For every set in `index` returns
/// `[Up, Down, Mixed]`, where `Up` averages `rank(stat)`, `Down` averages the
/// reversed rank `gf - rank(stat) + 1`, and `Mixed` averages `rank(|stat|)`
/// over the set's members. `gf` is the total number of genes as a float.
fn set_mean_ranks(stat: &[f64], index: &[Vec<usize>], gf: f64) -> Vec<[f64; 3]> {
    let signed_rank = rank_average(stat);
    let magnitude: Vec<f64> = stat.iter().map(|v| v.abs()).collect();
    let abs_rank = rank_average(&magnitude);

    let mut out = Vec::with_capacity(index.len());
    for set in index {
        // Running totals for [Up, Down, Mixed], accumulated in member order.
        let mut totals = [0.0f64; 3];
        for &gi in set {
            let rank = signed_rank[gi];
            totals[0] += rank;
            totals[1] += gf - rank + 1.0;
            totals[2] += abs_rank[gi];
        }
        let size = set.len() as f64;
        out.push(totals.map(|t| t / size));
    }
    out
}

/// Compute the `[Up, Down, Mixed]` statistic of every set in `index` from the
/// gene-wise statistics `stat`, using the aggregation named by
/// `set_statistic`. `gf` is the number of genes as a float (only the plain
/// mean-rank option uses it).
fn romer_set_stats(
    stat: &[f64],
    index: &[Vec<usize>],
    gf: f64,
    set_statistic: RomerStatistic,
) -> Vec<[f64; 3]> {
    // Mean of `ranks` over the members of one set, summed in member order.
    let mean_over = |ranks: &[f64], set: &[usize]| -> f64 {
        set.iter().fold(0.0, |sum, &gi| sum + ranks[gi]) / set.len() as f64
    };

    match set_statistic {
        RomerStatistic::Mean => set_mean_ranks(stat, index, gf),
        RomerStatistic::FloorMean => {
            // Each direction ranks its own floored copy of the statistic
            // (limma's pmax flooring): positives, negated negatives, and the
            // magnitude floored at one.
            let floored_up: Vec<f64> = stat.iter().map(|&v| v.max(0.0)).collect();
            let up_rank = rank_average(&floored_up);
            let floored_down: Vec<f64> = stat.iter().map(|&v| (-v).max(0.0)).collect();
            let down_rank = rank_average(&floored_down);
            let floored_abs: Vec<f64> = stat.iter().map(|&v| v.abs().max(1.0)).collect();
            let mixed_rank = rank_average(&floored_abs);

            let mut out = Vec::with_capacity(index.len());
            for set in index {
                out.push([
                    mean_over(&up_rank, set),
                    mean_over(&down_rank, set),
                    mean_over(&mixed_rank, set),
                ]);
            }
            out
        }
        RomerStatistic::Mean50 => {
            let signed_rank = rank_average(stat);
            let magnitude: Vec<f64> = stat.iter().map(|&v| v.abs()).collect();
            let abs_rank = rank_average(&magnitude);

            let mut out = Vec::with_capacity(index.len());
            for set in index {
                // Split point floor((|set| + 1) / 2).
                let half = set.len().div_ceil(2);
                let member_ranks: Vec<f64> = set.iter().map(|&gi| signed_rank[gi]).collect();
                let member_abs_ranks: Vec<f64> = set.iter().map(|&gi| abs_rank[gi]).collect();
                let (low_half, high_half) = mean_half(&member_ranks, half);
                let (_, high_abs_half) = mean_half(&member_abs_ranks, half);
                // Up takes the larger half of the signed ranks, Down the smaller
                // half, Mixed the larger half of the absolute ranks.
                out.push([high_half, low_half, high_abs_half]);
            }
            out
        }
    }
}

/// Mean of the smaller and larger halves of `x` (`.meanHalf`). `n` is the
/// 1-based split point `floor((len + 1) / 2)`; for odd lengths the median is
/// counted in both halves, matching limma. Returns `(small_half, large_half)`.
///
/// Values are ordered with `f64::total_cmp`, so a NaN in `x` no longer panics:
/// a (positive) NaN sorts last and makes the mean of the half containing it
/// NaN. An empty half (for example `x` empty with `n == 0`) has mean `0 / 0`,
/// i.e. NaN. Panics if `n > x.len()`, or if `n == 0` with an odd-length `x`.
fn mean_half(x: &[f64], n: usize) -> (f64, f64) {
    let mut sorted = x.to_vec();
    // Equal floats are interchangeable, so an unstable sort is sufficient.
    sorted.sort_unstable_by(f64::total_cmp);

    let mean = |vals: &[f64]| vals.iter().sum::<f64>() / vals.len() as f64;

    let small = mean(&sorted[..n]);
    let large = if sorted.len() % 2 == 0 {
        mean(&sorted[n..])
    } else {
        // Odd length: the median (element n - 1) belongs to both halves.
        mean(&sorted[(n - 1)..])
    };
    (small, large)
}

/// Rank gene sets from a [`romer`] result and keep the top `n` (`topRomer`).
///
/// Mirrors `topRomer`'s ordering: by the chosen alternative's p-value, then the
/// mixed p-value (for up/down) or `min(Up, Down)` (for mixed), then descending
/// set size. Ties keep input order, matching R's stable `order`.
///
/// If `n` exceeds the number of rows, all rows are returned. P-values are
/// compared with `f64::total_cmp`, so a row carrying a NaN p-value sorts after
/// every finite one instead of causing a panic.
pub fn top_romer(rows: &[RomerRow], n: usize, alternative: RomerAlternative) -> Vec<RomerRow> {
    // (primary, secondary) p-value keys for the requested alternative.
    let keys = |r: &RomerRow| match alternative {
        RomerAlternative::Up => (r.p_up, r.p_mixed),
        RomerAlternative::Down => (r.p_down, r.p_mixed),
        RomerAlternative::Mixed => (r.p_mixed, r.p_up.min(r.p_down)),
    };

    let mut ranked: Vec<&RomerRow> = rows.iter().collect();
    // `sort_by` is stable, so rows equal on every key stay in input order.
    ranked.sort_by(|a, b| {
        let (a_primary, a_secondary) = keys(a);
        let (b_primary, b_secondary) = keys(b);
        a_primary
            .total_cmp(&b_primary)
            .then_with(|| a_secondary.total_cmp(&b_secondary))
            // Larger sets first.
            .then_with(|| b.n_genes.cmp(&a.n_genes))
    });
    ranked.into_iter().take(n).cloned().collect()
}

/// Map a list of gene sets (each a list of identifiers) to 0-based indices into
/// `identifiers` (`ids2indices`). With `remove_empty`, sets that match nothing
/// are dropped.
///
/// Each output set lists, in ascending order, every position of `identifiers`
/// whose value appears in the corresponding input set. An identifier repeated
/// in `identifiers` contributes all of its positions; an identifier repeated
/// within a set is counted once. Output sets follow the order of `gene_sets`.
///
/// The identifier lookup table is built once and shared by all sets, so the
/// cost is one pass over `identifiers` plus work proportional to each set,
/// rather than a full scan of `identifiers` per set.
pub fn ids2indices(
    gene_sets: &[Vec<String>],
    identifiers: &[String],
    remove_empty: bool,
) -> Vec<Vec<usize>> {
    // identifier -> every position at which it occurs.
    let mut positions: std::collections::HashMap<&str, Vec<usize>> =
        std::collections::HashMap::with_capacity(identifiers.len());
    for (i, id) in identifiers.iter().enumerate() {
        positions.entry(id.as_str()).or_default().push(i);
    }

    let mut out = Vec::with_capacity(gene_sets.len());
    for set in gene_sets {
        // Look each distinct member up once, so duplicates inside a set cannot
        // produce repeated indices.
        let mut seen: HashSet<&str> = HashSet::with_capacity(set.len());
        let mut idx: Vec<usize> = set
            .iter()
            .map(String::as_str)
            .filter(|id| seen.insert(*id))
            .filter_map(|id| positions.get(id))
            .flatten()
            .copied()
            .collect();
        // Members arrive in set order; report positions in identifier order.
        idx.sort_unstable();
        if remove_empty && idx.is_empty() {
            continue;
        }
        out.push(idx);
    }
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    // Shared statistic vector: 20 genes with mixed signs and one deliberate
    // tie (stats[2] == stats[10] == 1.8).
    fn fixture() -> Vec<f64> {
        const STATS: [f64; 20] = [
            2.1, -0.5, 1.8, 0.3, -1.2, // genes 1-5
            2.5, -0.1, 1.1, -2.2, 0.7, // genes 6-10
            1.8, -0.9, 0.4, -1.5, 2.0, // genes 11-15
            -0.3, 1.3, -0.8, 0.6, -2.1, // genes 16-20
        ];
        STATS.to_vec()
    }

    // Gene set c(1,3,6,8,15,17) in R's 1-based numbering, shifted to the
    // 0-based indices used on the Rust side.
    fn up_set() -> Vec<usize> {
        [1usize, 3, 6, 8, 15, 17]
            .iter()
            .map(|r_index| r_index - 1)
            .collect()
    }

    #[test]
    fn rank_average_handles_ties() {
        // The two tied 1.8 values must share a single rank.
        let r = rank_average(&fixture());
        assert_eq!(r[2], r[10]);
        // Fifteen fixture values are smaller than 1.8, so the tie occupies the
        // 1-based sorted positions 16 and 17 and the shared rank is their mean.
        // Pinning the value rules out a min- or max-style tie rule, which the
        // equality check alone would accept.
        assert_eq!(r[2], 16.5);
    }

    #[test]
    fn gene_set_test_matches_r() {
        // Reference p-values for the up set, one per alternative.
        const TOL: f64 = 1e-9;
        const MIXED_P: f64 = 0.0143292516670446;

        let stats = fixture();
        let idx = up_set();
        let expected = [
            (Alternative::Up, 0.000645718763498011),
            (Alternative::Down, 0.999517239270778),
            (Alternative::Either, 0.00129143752699602),
            (Alternative::Mixed, MIXED_P),
        ];
        for (alt, want) in expected {
            let got = gene_set_test(&idx, &stats, alt);
            assert!(
                (got - want).abs() < TOL,
                "gene_set_test({alt:?}): got {got}, want {want}"
            );
        }

        // wilcoxGST must agree with the mixed alternative of geneSetTest.
        let wilcox_p = wilcox_gst(&idx, &stats);
        assert!((wilcox_p - MIXED_P).abs() < TOL);
    }

    #[test]
    fn rank_sum_test_matches_r() {
        // Every case pins both values of the returned `(less, greater)` pair.
        let within_tol = |got: f64, want: f64| (got - want).abs() < 1e-9;
        let stats = fixture();
        let up = up_set();

        // Up set, correlation = 0.1, df = 10.
        let (less, greater) = rank_sum_test_with_correlation(&up, &stats, 0.1, 10.0);
        assert!(within_tol(less, 0.991665460749303));
        assert!(within_tol(greater, 0.0094257162710415));

        // Up set, correlation = 0, df = Inf (normal).
        let (less, greater) = rank_sum_test_with_correlation(&up, &stats, 0.0, f64::INFINITY);
        assert!(within_tol(less, 0.999517239270778));
        assert!(within_tol(greater, 0.000645718763498011));

        // A different set, R index c(2,5,9,14,20), correlation = 0.25, df = 18.
        let other = [1, 4, 8, 13, 19];
        let (less, greater) = rank_sum_test_with_correlation(&other, &stats, 0.25, 18.0);
        assert!(within_tol(less, 0.0152351621428473));
        assert!(within_tol(greater, 0.986720588745113));
    }

    // Three gene sets over the 20-gene fixture, written as R's 1-based index
    // vectors and shifted to 0-based on the way out.
    fn three_sets() -> Vec<Vec<usize>> {
        let r_sets: [&[usize]; 3] = [
            &[1, 3, 6, 8, 15, 17], // set1
            &[2, 5, 9, 14, 20],    // set2
            &[4, 7, 10, 13],       // set3
        ];
        r_sets
            .iter()
            .map(|r_set| r_set.iter().map(|r_index| r_index - 1).collect::<Vec<usize>>())
            .collect()
    }

    #[test]
    fn camera_pr_parametric_matches_r() {
        let statistics = fixture();
        let gene_sets = three_sets();
        let rows = camera_pr(&statistics, &gene_sets, 0.01, false, true);

        // Expected table in R's output order (sorted by p-value): set1, set2,
        // set3. Columns: set index, gene count, direction, p-value, FDR.
        let expected = [
            (
                0usize,
                6usize,
                Direction::Up,
                0.000305279883783743,
                0.000489203611923099,
            ),
            (
                1,
                5,
                Direction::Down,
                0.000326135741282066,
                0.000489203611923099,
            ),
            (2, 4, Direction::Up, 0.91114902618042, 0.91114902618042),
        ];

        // `zip` stops at the shorter side, so check the row count first.
        assert_eq!(rows.len(), expected.len());
        for (row, expected_row) in rows.iter().zip(expected) {
            let (set, n_genes, direction, p, fdr) = expected_row;
            assert_eq!(row.set, set);
            assert_eq!(row.n_genes, n_genes);
            assert_eq!(row.direction, direction);
            assert!(
                (row.p_value - p).abs() < 1e-9,
                "p: got {}, want {p}",
                row.p_value
            );
            assert!(
                (row.fdr - fdr).abs() < 1e-9,
                "fdr: got {}, want {fdr}",
                row.fdr
            );
        }
    }

    #[test]
    fn camera_pr_use_ranks_matches_r() {
        // Same fixture and sets as the parametric camera_pr test, but with the
        // fourth argument switched to `true` (the rank-based variant named in
        // this test).
        let stat = fixture();
        let sets = three_sets();
        let rows = camera_pr(&stat, &sets, 0.01, true, true);
        // Expected rows: set index, direction, p-value, FDR.
        let want = [
            (
                0usize,
                Direction::Up,
                0.00153858324497317,
                0.00385566266453651,
            ),
            (1, Direction::Down, 0.00257044177635767, 0.00385566266453651),
            (2, Direction::Up, 0.962711741316641, 0.962711741316641),
        ];
        // `zip` silently stops at the shorter side: without this check an
        // empty or truncated result would pass the loop below vacuously.
        assert_eq!(rows.len(), want.len());
        for (r, (set, dir, p, fdr)) in rows.iter().zip(want) {
            assert_eq!(r.set, set);
            assert_eq!(r.direction, dir);
            assert!(
                (r.p_value - p).abs() < 1e-9,
                "p: got {}, want {p}",
                r.p_value
            );
            assert!((r.fdr - fdr).abs() < 1e-9, "fdr: got {}, want {fdr}", r.fdr);
        }
    }

    // 12 genes x 6 samples; design = model.matrix(~group), group=A,A,A,B,B,B.
    //
    // Shared expression fixture for the inter_gene_correlation, camera and fry
    // tests. Each source line below holds six values, matching the six columns
    // of the (12, 6) shape; the trailing `//` markers presumably just keep
    // rustfmt from re-wrapping those lines.
    fn camera_exprs() -> Array2<f64> {
        Array2::from_shape_vec(
            (12, 6),
            vec![
                4.871, 4.629, 4.697, 5.807, 4.798, 5.195, //
                6.356, 6.349, 6.764, 4.125, 3.125, 4.752, //
                4.298, 4.659, 4.508, 5.936, 4.075, 7.367, //
                8.896, 9.420, 8.915, 9.165, 9.466, 8.598, //
                6.563, 6.610, 6.813, 6.123, 6.155, 7.309, //
                4.443, 4.283, 3.851, 5.435, 5.304, 5.784, //
                7.247, 7.184, 7.620, 6.533, 7.878, 6.820, //
                7.456, 7.644, 8.368, 9.096, 7.422, 10.245, //
                7.229, 6.945, 6.986, 8.178, 7.445, 10.159, //
                5.378, 5.177, 4.919, 7.692, 6.023, 7.432, //
                8.748, 9.133, 9.280, 9.431, 10.394, 11.954, //
                6.697, 7.010, 6.719, 4.293, 3.114, 5.796, //
            ],
        )
        // The literal has exactly 12 * 6 values, so the shape always fits.
        .unwrap()
    }

    // Design matrix paired with `camera_exprs`: shape (6, 2). Read as
    // consecutive pairs, the values are (1, 0) three times and then (1, 1)
    // three times, i.e. a constant column plus an indicator for the last three
    // samples (assuming ndarray's default row-major fill).
    fn camera_design() -> Array2<f64> {
        Array2::from_shape_vec(
            (6, 2),
            vec![1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        )
        // Twelve values for a 6 x 2 shape: the length always matches.
        .unwrap()
    }

    // Three contiguous gene sets that together cover all 12 rows of
    // `camera_exprs`: R's 1:4, 5:9 and 10:12, as 0-based half-open ranges.
    fn camera_sets() -> Vec<Vec<usize>> {
        vec![
            (0..4).collect(),  // set1 = c(1,2,3,4)
            (4..9).collect(),  // set2 = c(5,6,7,8,9)
            (9..12).collect(), // set3 = c(10,11,12)
        ]
    }

    #[test]
    fn inter_gene_correlation_matches_r() {
        // The result is read as (variance-inflation factor, correlation).
        let exprs = camera_exprs();
        let design = camera_design();
        let (vif, cor) = inter_gene_correlation(&exprs, &design);

        let vif_error = (vif - 3.54050719052897).abs();
        let cor_error = (cor - 0.230955199138998).abs();
        assert!(vif_error < 1e-9, "vif: {vif}");
        assert!(cor_error < 1e-9, "cor: {cor}");
    }

    #[test]
    fn camera_parametric_matches_r() {
        let exprs = camera_exprs();
        let design = camera_design();
        let gene_sets = camera_sets();
        let rows = camera(&exprs, &design, 1, &gene_sets, 0.01, false, true).unwrap();

        // Expected table in R's output order (sorted by p-value): set1, set2,
        // set3. Columns: set index, gene count, direction, p-value, FDR.
        let expected = [
            (
                0usize,
                4usize,
                Direction::Down,
                0.42121952380793,
                0.753808705609041,
            ),
            (1, 5, Direction::Up, 0.502539137072694, 0.753808705609041),
            (2, 3, Direction::Up, 0.916115986180527, 0.916115986180527),
        ];

        // `zip` stops at the shorter side, so check the row count first.
        assert_eq!(rows.len(), expected.len());
        for (row, expected_row) in rows.iter().zip(expected) {
            let (set, n_genes, direction, p, fdr) = expected_row;
            assert_eq!(row.set, set);
            assert_eq!(row.n_genes, n_genes);
            assert_eq!(row.direction, direction);
            assert!(
                (row.p_value - p).abs() < 1e-7,
                "p: got {}, want {p}",
                row.p_value
            );
            assert!(
                (row.fdr - fdr).abs() < 1e-7,
                "fdr: got {}, want {fdr}",
                row.fdr
            );
        }
    }

    #[test]
    fn camera_use_ranks_matches_r() {
        let exprs = camera_exprs();
        let design = camera_design();
        let gene_sets = camera_sets();
        let rows = camera(&exprs, &design, 1, &gene_sets, 0.01, true, true).unwrap();

        // Expected table in R's output order (sorted by p-value): set1, set3,
        // set2. Columns: set index, direction, p-value, FDR.
        let expected = [
            (
                0usize,
                Direction::Down,
                0.354526685759271,
                0.693805951243274,
            ),
            (2, Direction::Up, 0.462537300828849, 0.693805951243274),
            (1, Direction::Up, 0.872315053291437, 0.872315053291437),
        ];

        // `zip` stops at the shorter side, so check the row count first.
        assert_eq!(rows.len(), expected.len());
        for (row, expected_row) in rows.iter().zip(expected) {
            let (set, direction, p, fdr) = expected_row;
            assert_eq!(row.set, set);
            assert_eq!(row.direction, direction);
            assert!(
                (row.p_value - p).abs() < 1e-7,
                "p: got {}, want {p}",
                row.p_value
            );
            assert!(
                (row.fdr - fdr).abs() < 1e-7,
                "fdr: got {}, want {fdr}",
                row.fdr
            );
        }
    }

    // Checks `fry` on the camera fixture against reference values from R, with
    // rows requested in directional sort order.
    #[test]
    fn fry_matches_r() {
        let rows = fry(
            &camera_exprs(),
            &camera_design(),
            1,
            &camera_sets(),
            FrySort::Directional,
        )
        .unwrap();
        // R output order (directional sort): set2, set1, set3.
        // Tuple columns, as destructured in the loop below: set index, gene
        // count, direction, p-value, FDR, mixed p-value, mixed FDR.
        let want = [
            (
                1usize,
                5usize,
                Direction::Up,
                0.124433966893834,
                0.373301900681503,
                0.0667070665318511,
                0.0667070665318511,
            ),
            (
                0,
                4,
                Direction::Down,
                0.44028222758847,
                0.45071371571786,
                0.00113516116620128,
                0.00170274174930192,
            ),
            (
                2,
                3,
                Direction::Up,
                0.45071371571786,
                0.45071371571786,
                0.000139022937183932,
                0.000417068811551796,
            ),
        ];
        // `zip` stops at the shorter side, so the row count is checked first.
        assert_eq!(rows.len(), want.len());
        for (r, (set, ng, dir, p, fdr, pm, fdrm)) in rows.iter().zip(want) {
            assert_eq!(r.set, set);
            assert_eq!(r.n_genes, ng);
            assert_eq!(r.direction, dir);
            assert!(
                (r.p_value - p).abs() < 1e-6,
                "p: got {}, want {p}",
                r.p_value
            );
            assert!((r.fdr - fdr).abs() < 1e-6, "fdr: got {}, want {fdr}", r.fdr);
            assert!(
                (r.p_value_mixed - pm).abs() < 1e-6,
                "pm: got {}, want {pm}",
                r.p_value_mixed
            );
            assert!(
                (r.fdr_mixed - fdrm).abs() < 1e-6,
                "fdrm: got {}, want {fdrm}",
                r.fdr_mixed
            );
        }
    }

    #[test]
    fn ids2indices_maps_and_drops_empty() {
        // Small helper: turn string literals into the owned form the API takes.
        fn owned(names: &[&str]) -> Vec<String> {
            names.iter().map(|s| s.to_string()).collect()
        }

        let ids = owned(&["a", "b", "c", "d", "e"]);
        // The middle set ("x") matches no identifier at all.
        let sets = vec![owned(&["b", "d"]), owned(&["x"]), owned(&["a", "e", "c"])];

        // Indices come back in identifier order, not in the set's own order;
        // the unmatched set is kept as an empty list.
        let kept = ids2indices(&sets, &ids, false);
        assert_eq!(kept, vec![vec![1, 3], vec![], vec![0, 2, 4]]);

        // With `remove_empty` the unmatched set disappears from the output.
        let dropped = ids2indices(&sets, &ids, true);
        assert_eq!(dropped, vec![vec![1, 3], vec![0, 2, 4]]);
    }

    #[test]
    #[allow(clippy::excessive_precision)]
    fn roast_matches_r() {
        // Expression matrix: R's matrix(rnorm(50*6), 50, 6) after
        // set.seed(2024). RRng is bit-exact to R's rnorm, so the same draws are
        // regenerated here. R fills matrices column-major, hence the
        // (samples, genes) shape followed by a transpose.
        let n_genes = 50usize;
        let n_samples = 6usize;
        let draws = RRng::new(2024).rnorm(n_genes * n_samples);
        let y = Array2::from_shape_vec((n_samples, n_genes), draws)
            .unwrap()
            .t()
            .to_owned();

        // design = cbind(Intercept=1, Group=c(0,0,0,1,1,1)); R's contrast=2 is
        // coef=1 here. Listed column by column and transposed, like `y`.
        let design_columns = vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
        let design = Array2::from_shape_vec((2, n_samples), design_columns)
            .unwrap()
            .t()
            .to_owned();

        // Runs one roast call on a freshly seeded generator and compares its
        // four active proportions and four p-values with the R reference.
        let check = |tag: &str,
                     seed: i32,
                     index: &[usize],
                     nrot: usize,
                     want_active: [f64; 4],
                     want_p: [f64; 4]| {
            let mut rng = RRng::new(seed);
            let out = roast(&y, &design, 1, index, nrot, &mut rng).unwrap();
            assert_eq!(out.n_genes_in_set, index.len());
            for (i, (&active, &p)) in want_active.iter().zip(&want_p).enumerate() {
                assert!(
                    (out.active_prop[i] - active).abs() < 1e-12,
                    "{tag} active[{i}]: got {}, want {}",
                    out.active_prop[i],
                    active
                );
                // Counts are integers, so matching R's counts makes the p-value
                // bit-exact. Even a loose tolerance would catch a single-count
                // drift, which shifts a p-value by about 1/nrot.
                assert!(
                    (out.p_value[i] - p).abs() < 1e-12,
                    "{tag} p[{i}]: got {}, want {}",
                    out.p_value[i],
                    p
                );
            }
        };

        let first_ten: Vec<usize> = (0..10).collect();
        let middle_block: Vec<usize> = (10..35).collect();

        check(
            "A",
            99,
            &first_ten,
            1999,
            [0.1, 0.0, 0.1, 0.1],
            [0.47211802950737686, 0.52813203300825207, 0.944, 0.344],
        );

        check(
            "B",
            7,
            &middle_block,
            1999,
            [0.0, 0.12, 0.12, 0.12],
            [0.90547636909227303, 0.094773693423355843, 0.1895, 0.384],
        );

        check(
            "C",
            123,
            &first_ten,
            999,
            [0.1, 0.0, 0.1, 0.1],
            [0.47423711855927964, 0.52626313156578286, 0.948, 0.364],
        );
    }

    // Checks `mroast` against R reference output for four gene sets: every
    // per-set column with sorting disabled, then only the row order for the
    // directional and mixed sorts.
    #[test]
    // Reference constants are kept at the full precision they were recorded with.
    #[allow(clippy::excessive_precision)]
    fn mroast_matches_r() {
        // Same y/design fixture as roast_matches_r (set.seed(2024)); reference
        // from scratch/mroast_ref.R with seed 314, nrot 1999, midp = TRUE.
        let g = 50usize;
        let n = 6usize;
        // Draws are laid out as (samples, genes) and transposed to genes x samples.
        let y = Array2::from_shape_vec((n, g), RRng::new(2024).rnorm(g * n))
            .unwrap()
            .t()
            .to_owned();
        // Two design columns of length n (all ones, then 0,0,0,1,1,1), transposed
        // to n x 2.
        let design = Array2::from_shape_vec(
            (2, n),
            vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0],
        )
        .unwrap()
        .t()
        .to_owned();

        // index = list(S1=1:10, S2=11:35, S3=c(5:15,40:50), S4=20:24) (0-based).
        // The R ranges above are 1-based and inclusive; the Rust ranges below
        // are their 0-based, half-open equivalents.
        let index: Vec<Vec<usize>> = vec![
            (0..10).collect(),
            (10..35).collect(),
            (4..15).chain(39..50).collect(),
            (19..24).collect(),
        ];

        // sort = "none": validate every per-set number in input order.
        let mut rng = RRng::new(314);
        let tab = mroast(
            &y,
            &design,
            1,
            &index,
            1999,
            true,
            FrySort::NoSort,
            &mut rng,
        )
        .unwrap();
        // Expected columns, one entry per set in input order (S1..S4).
        let want_ngenes = [10usize, 25, 22, 5];
        let want_propdown = [0.1, 0.0, 0.090909090909090912, 0.0];
        let want_propup = [0.0, 0.12, 0.090909090909090912, 0.4];
        let want_dir = [
            Direction::Down,
            Direction::Up,
            Direction::Down,
            Direction::Up,
        ];
        let want_p = [0.943, 0.1875, 0.6645, 0.124];
        let want_fdr = [0.943, 0.3745, 0.8856666666666666, 0.3745];
        let want_pm = [0.348, 0.403, 0.5915, 0.1265];
        let want_fdrm = [0.537, 0.537, 0.5915, 0.505];
        for i in 0..4 {
            assert_eq!(tab[i].set, i, "row {i} set");
            assert_eq!(tab[i].n_genes, want_ngenes[i], "row {i} ngenes");
            assert_eq!(tab[i].direction, want_dir[i], "row {i} direction");
            assert!(
                (tab[i].prop_down - want_propdown[i]).abs() < 1e-12,
                "row {i} propdown: got {}, want {}",
                tab[i].prop_down,
                want_propdown[i]
            );
            assert!(
                (tab[i].prop_up - want_propup[i]).abs() < 1e-12,
                "row {i} propup: got {}, want {}",
                tab[i].prop_up,
                want_propup[i]
            );
            assert!(
                (tab[i].p_value - want_p[i]).abs() < 1e-12,
                "row {i} pvalue: got {}, want {}",
                tab[i].p_value,
                want_p[i]
            );
            assert!(
                (tab[i].fdr - want_fdr[i]).abs() < 1e-12,
                "row {i} fdr: got {}, want {}",
                tab[i].fdr,
                want_fdr[i]
            );
            assert!(
                (tab[i].p_value_mixed - want_pm[i]).abs() < 1e-12,
                "row {i} pvalue_mixed: got {}, want {}",
                tab[i].p_value_mixed,
                want_pm[i]
            );
            assert!(
                (tab[i].fdr_mixed - want_fdrm[i]).abs() < 1e-12,
                "row {i} fdr_mixed: got {}, want {}",
                tab[i].fdr_mixed,
                want_fdrm[i]
            );
        }

        // sort = "directional": rows ordered S4,S2,S3,S1.
        // The generator is re-seeded with 314 before each call, so every sort
        // mode starts from the same RNG state.
        let mut rng = RRng::new(314);
        let td = mroast(
            &y,
            &design,
            1,
            &index,
            1999,
            true,
            FrySort::Directional,
            &mut rng,
        )
        .unwrap();
        let order_d: Vec<usize> = td.iter().map(|r| r.set).collect();
        assert_eq!(order_d, vec![3, 1, 2, 0], "directional order");

        // sort = "mixed": rows ordered S4,S1,S2,S3.
        let mut rng = RRng::new(314);
        let tm = mroast(&y, &design, 1, &index, 1999, true, FrySort::Mixed, &mut rng).unwrap();
        let order_m: Vec<usize> = tm.iter().map(|r| r.set).collect();
        assert_eq!(order_m, vec![3, 0, 1, 2], "mixed order");
    }

    // Checks `romer` p-values and the `top_romer` orderings against R reference
    // output, first for the mean set statistic and then for two others.
    #[test]
    // Kept for consistency with the sibling reference tests.
    #[allow(clippy::excessive_precision)]
    fn romer_matches_r() {
        // Same y/design fixture as roast_matches_r (set.seed(2024)); reference
        // from scratch/romer_ref.R with seed 271, nrot 999, shrink.resid = TRUE.
        let g = 50usize;
        let n = 6usize;
        // Draws are laid out as (samples, genes) and transposed to genes x samples.
        let y = Array2::from_shape_vec((n, g), RRng::new(2024).rnorm(g * n))
            .unwrap()
            .t()
            .to_owned();
        // Two design columns of length n (all ones, then 0,0,0,1,1,1), transposed
        // to n x 2.
        let design = Array2::from_shape_vec(
            (2, n),
            vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0],
        )
        .unwrap()
        .t()
        .to_owned();

        // index = list(S1=1:10, S2=11:35, S3=c(5:15,40:50), S4=20:24) (0-based).
        // The R ranges above are 1-based and inclusive; the Rust ranges below
        // are their 0-based, half-open equivalents.
        let index: Vec<Vec<usize>> = vec![
            (0..10).collect(),
            (10..35).collect(),
            (4..15).chain(39..50).collect(),
            (19..24).collect(),
        ];

        let mut rng = RRng::new(271);
        let tab = romer(
            &y,
            &design,
            1,
            &index,
            RomerStatistic::Mean,
            999,
            true,
            &mut rng,
        )
        .unwrap();
        // Expected columns, one entry per set in input order (S1..S4).
        let want_ngenes = [10usize, 25, 22, 5];
        let want_up = [0.563, 0.476, 0.987, 0.492];
        let want_down = [0.441, 0.527, 0.016, 0.526];
        let want_mixed = [0.231, 0.341, 0.632, 0.139];
        for i in 0..4 {
            assert_eq!(tab[i].set, i, "row {i} set");
            assert_eq!(tab[i].n_genes, want_ngenes[i], "row {i} ngenes");
            // Counts are integers; a bit-exact p-value implies the counts match.
            assert!(
                (tab[i].p_up - want_up[i]).abs() < 1e-12,
                "row {i} up: got {}, want {}",
                tab[i].p_up,
                want_up[i]
            );
            assert!(
                (tab[i].p_down - want_down[i]).abs() < 1e-12,
                "row {i} down: got {}, want {}",
                tab[i].p_down,
                want_down[i]
            );
            assert!(
                (tab[i].p_mixed - want_mixed[i]).abs() < 1e-12,
                "row {i} mixed: got {}, want {}",
                tab[i].p_mixed,
                want_mixed[i]
            );
        }

        // topRomer orderings (S1..S4 -> 0..3).
        // All four rows are requested, so only the ordering is under test.
        let order = |a: RomerAlternative| -> Vec<usize> {
            top_romer(&tab, 4, a).iter().map(|r| r.set).collect()
        };
        assert_eq!(order(RomerAlternative::Up), vec![1, 3, 0, 2], "top up");
        assert_eq!(order(RomerAlternative::Down), vec![2, 0, 3, 1], "top down");
        assert_eq!(
            order(RomerAlternative::Mixed),
            vec![3, 0, 1, 2],
            "top mixed"
        );

        // Non-default set.statistic options on the same fixture/seed; reference
        // integer rotation counts from scratch/romer_stats_ref.R. The p-value is
        // (count + 1) / (nrot + 1), so matching p implies matching counts.
        let check_stat =
            |stat: RomerStatistic, up: [i64; 4], down: [i64; 4], mixed: [i64; 4], tag: &str| {
                // Re-seed with 271 so each statistic starts from the same RNG state.
                let mut rng = RRng::new(271);
                let t = romer(&y, &design, 1, &index, stat, 999, true, &mut rng).unwrap();
                // 1000.0 is nrot + 1 for the nrot = 999 passed above.
                let p = |c: i64| (c as f64 + 1.0) / 1000.0;
                for i in 0..4 {
                    assert!((t[i].p_up - p(up[i])).abs() < 1e-12, "{tag} row {i} up");
                    assert!(
                        (t[i].p_down - p(down[i])).abs() < 1e-12,
                        "{tag} row {i} down"
                    );
                    assert!(
                        (t[i].p_mixed - p(mixed[i])).abs() < 1e-12,
                        "{tag} row {i} mixed"
                    );
                }
            };
        check_stat(
            RomerStatistic::FloorMean,
            [426, 505, 925, 452],
            [477, 367, 121, 169],
            [900, 366, 666, 201],
            "floormean",
        );
        check_stat(
            RomerStatistic::Mean50,
            [479, 351, 935, 329],
            [361, 396, 109, 165],
            [690, 246, 707, 169],
            "mean50",
        );
    }

    // Checks `contrast_as_coef` against limma for a one-column and a two-column
    // contrast, each with the boolean third argument set to `true` and `false`.
    #[test]
    // Reference constants are kept at the full precision they were recorded with.
    #[allow(clippy::excessive_precision)]
    fn contrast_as_coef_matches_r() {
        // design = cbind(A = 1, B = c(0,0,1,1,0,0), C = c(0,0,0,0,1,1)), 6 x 3.
        // Written as three rows of six (one per design column) and transposed.
        let design = Array2::from_shape_vec(
            (3, 6),
            vec![
                1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // A
                0.0, 0.0, 1.0, 1.0, 0.0, 0.0, // B
                0.0, 0.0, 0.0, 0.0, 1.0, 1.0, // C
            ],
        )
        .unwrap()
        .t()
        .to_owned();

        // Compare reformed design (column-major), coef and rank against limma.
        let check = |out: &ContrastAsCoef, rank: usize, coef: &[usize], cm: &[f64]| {
            assert_eq!(out.rank, rank, "rank");
            assert_eq!(out.coef.as_slice(), coef, "coef");
            let n = out.design.nrows();
            let p = out.design.ncols();
            // Flatten column by column (outer loop over columns) so the values
            // line up with the column-major reference in `cm`.
            let mut flat = Vec::with_capacity(n * p);
            for j in 0..p {
                for i in 0..n {
                    flat.push(out.design[[i, j]]);
                }
            }
            assert_eq!(flat.len(), cm.len(), "design length");
            for (idx, (&a, &b)) in flat.iter().zip(cm).enumerate() {
                assert!((a - b).abs() < 1e-12, "design[{idx}]: {a} vs {b}");
            }
        };

        // Single contrast B - C.
        let v1 = Array2::from_shape_vec((3, 1), vec![0.0, 1.0, -1.0]).unwrap();
        // With `true` the expected coef index is the leading column, [0].
        check(
            &contrast_as_coef(&design, &v1, true).unwrap(),
            1,
            &[0],
            &[
                0.0,
                0.0,
                0.49999999999999994,
                0.49999999999999994,
                -0.49999999999999994,
                -0.49999999999999994,
                -0.70710678118654746,
                -0.70710678118654746,
                -0.20710678118654746,
                -0.20710678118654746,
                -0.20710678118654754,
                -0.20710678118654754,
                0.70710678118654746,
                0.70710678118654746,
                1.2071067811865475,
                1.2071067811865475,
                1.2071067811865475,
                1.2071067811865475,
            ],
        );
        // With `false` the expected coef index is the trailing column, [2]; the
        // reference holds the same three columns with the contrast column last.
        check(
            &contrast_as_coef(&design, &v1, false).unwrap(),
            1,
            &[2],
            &[
                -0.70710678118654746,
                -0.70710678118654746,
                -0.20710678118654746,
                -0.20710678118654746,
                -0.20710678118654754,
                -0.20710678118654754,
                0.70710678118654746,
                0.70710678118654746,
                1.2071067811865475,
                1.2071067811865475,
                1.2071067811865475,
                1.2071067811865475,
                0.0,
                0.0,
                0.49999999999999994,
                0.49999999999999994,
                -0.49999999999999994,
                -0.49999999999999994,
            ],
        );

        // Two-column full-rank contrast: (B-C) and (B+C-2A).
        // Each pair in the literal is one row of the 3 x 2 matrix, so the
        // columns read (0, 1, -1) and (-2, 1, 1).
        let m2 = Array2::from_shape_vec((3, 2), vec![0.0, -2.0, 1.0, 1.0, -1.0, 1.0]).unwrap();
        // With `true` the expected coef indices are the two leading columns.
        check(
            &contrast_as_coef(&design, &m2, true).unwrap(),
            2,
            &[0, 1],
            &[
                0.0,
                0.0,
                0.49999999999999994,
                0.49999999999999994,
                -0.49999999999999994,
                -0.49999999999999994,
                -0.33333333333333337,
                -0.33333333333333337,
                -0.16666666666666663,
                -0.16666666666666663,
                -0.16666666666666669,
                -0.16666666666666669,
                0.57735026918962573,
                0.57735026918962573,
                1.1547005383792515,
                1.1547005383792515,
                1.1547005383792515,
                1.1547005383792515,
            ],
        );
        // With `false` the expected coef indices are the two trailing columns.
        check(
            &contrast_as_coef(&design, &m2, false).unwrap(),
            2,
            &[1, 2],
            &[
                0.57735026918962573,
                0.57735026918962573,
                1.1547005383792515,
                1.1547005383792515,
                1.1547005383792515,
                1.1547005383792515,
                0.0,
                0.0,
                0.49999999999999994,
                0.49999999999999994,
                -0.49999999999999994,
                -0.49999999999999994,
                -0.33333333333333337,
                -0.33333333333333337,
                -0.16666666666666663,
                -0.16666666666666663,
                -0.16666666666666669,
                -0.16666666666666669,
            ],
        );
    }
}