gsva-rust 0.1.0

Pure-Rust port of the GSVA family of gene-set enrichment methods (GSVA, ssGSEA, z-score, PLAGE), validated for numeric parity against the Bioconductor GSVA package.
Documentation
//! PLAGE — Pathway Level Analysis of Gene Expression (Tomfohr *et al.* 2005), as
//! implemented by GSVA's `plage` path.
//!
//! Rows are standardized (mean 0, unit sample SD), and each gene set's per-sample
//! score vector is the **first right-singular vector** of the standardized
//! submatrix `Z[geneSet, ]` — the set's dominant "metagene".
//!
//! ## Sign convention
//!
//! A singular vector is only defined up to sign: `v` and `-v` are both valid. The
//! Bioconductor GSVA package returns whatever sign LAPACK's `dgesdd` happens to
//! produce, which is an internal artifact of its divide-and-conquer
//! bidiagonalization and is **not reproducible** from the input alone (verified:
//! no deterministic function of the input matrix predicts it). `gsva-rust` instead
//! fixes a stable, documented convention — the entry of largest magnitude is made
//! positive — so results are deterministic across runs and platforms. PLAGE scores
//! therefore match GSVA *up to a per-gene-set sign* (the intrinsic PLAGE
//! ambiguity); the metagene direction and all magnitudes agree to full precision.

use crate::matrix::ExprMatrix;
use crate::prep;
use crate::result::EnrichmentResult;

/// Parameters for PLAGE. Defaults mirror GSVA `plageParam`: `min_size = 1`,
/// `max_size = unbounded`.
#[derive(Clone, Debug)]
pub struct PlageParams {
    /// Minimum number of matched genes for a set to be scored.
    pub min_size: usize,
    /// Maximum number of matched genes for a set to be scored.
    pub max_size: usize,
}

impl Default for PlageParams {
    fn default() -> Self {
        PlageParams {
            min_size: 1,
            max_size: usize::MAX,
        }
    }
}

/// Compute PLAGE enrichment scores.
///
/// Mirrors `gsva(plageParam(expr, geneSets, minSize, maxSize))`: constant rows are
/// dropped, gene sets are mapped to the remaining rows and size-filtered, rows are
/// centered/scaled, and each set's score is the first right-singular vector of its
/// standardized submatrix (sign-canonicalized; see the module docs).
///
/// Panics on the conditions R aborts on (no genes matched, no set left, or
/// duplicated surviving set names).
pub fn plage(
    expr: &ExprMatrix,
    gene_sets: &crate::geneset::GeneSets,
    params: &PlageParams,
) -> EnrichmentResult {
    let filtered = prep::filter_constant_rows(expr);
    let (names, indices) =
        prep::map_and_filter_sets(&filtered, gene_sets, params.min_size, params.max_size);
    let z = prep::scale_rows(&filtered);
    let n = filtered.ncol();

    let mut scores = vec![0.0f64; names.len() * n];
    // Each gene set's metagene (first right-singular vector) is independent —
    // parallelize over sets; the per-set Jacobi SVD is byte-for-byte unchanged.
    crate::par::fill_chunks_mut(&mut scores, n, |s, out| {
        let idx = &indices[s];
        let k = idx.len();
        // Gather the set's standardized submatrix A = Z[idx, ] as k x n row-major.
        let mut a = vec![0.0f64; k * n];
        for (r, &g) in idx.iter().enumerate() {
            a[r * n..(r + 1) * n].copy_from_slice(&z[g * n..(g + 1) * n]);
        }
        let v1 = leading_right_singular_vector(&a, k, n);
        out.copy_from_slice(&v1);
    });

    EnrichmentResult {
        gene_sets: names,
        samples: filtered.col_names().to_vec(),
        scores,
    }
}

/// First right-singular vector of a `k x n` (row-major) matrix `a`, returned as a
/// length-`n` unit vector with its largest-magnitude entry made positive.
///
/// The default `faer` build dispatches to faer's pure-Rust (SIMD, no BLAS/LAPACK)
/// SVD. faer sorts singular values in nonincreasing order, so the leading
/// right-singular vector is column 0 of `V`.
#[cfg(feature = "faer")]
fn leading_right_singular_vector(a: &[f64], k: usize, n: usize) -> Vec<f64> {
    use faer::Mat;
    let mat = Mat::from_fn(k, n, |i, j| a[i * n + j]);
    let svd = mat.svd().expect("faer SVD failed to converge");
    let v = svd.V();
    let mut v1: Vec<f64> = (0..n).map(|i| v[(i, 0)]).collect();
    canonicalize_sign(&mut v1);
    v1
}

/// First right-singular vector of a `k x n` (row-major) matrix `a`, returned as a
/// length-`n` unit vector with its largest-magnitude entry made positive.
///
/// The `--no-default-features` (zero-dependency) build uses one-sided (Hestenes)
/// Jacobi: Jacobi rotations orthogonalize the columns of `A` while the same
/// rotations accumulate into `V`. At convergence the column norms are the singular
/// values and the columns of `V` are the right-singular vectors. One-sided Jacobi
/// computes singular vectors to high relative accuracy without forming `AᵀA`, so
/// magnitudes track LAPACK closely.
#[cfg(not(feature = "faer"))]
fn leading_right_singular_vector(a: &[f64], k: usize, n: usize) -> Vec<f64> {
    let mut acol = a.to_vec(); // working copy of A's columns (k x n row-major)
    let mut v = vec![0.0f64; n * n]; // accumulated right factor, init identity
    for i in 0..n {
        v[i * n + i] = 1.0;
    }

    let max_sweeps = 60;
    for _ in 0..max_sweeps {
        let mut rotated = false;
        for p in 0..n {
            for q in (p + 1)..n {
                let mut alpha = 0.0f64; // <col p, col p>
                let mut beta = 0.0f64; // <col q, col q>
                let mut gamma = 0.0f64; // <col p, col q>
                for i in 0..k {
                    let ap = acol[i * n + p];
                    let aq = acol[i * n + q];
                    alpha += ap * ap;
                    beta += aq * aq;
                    gamma += ap * aq;
                }
                // Columns already orthogonal (to working precision): skip.
                if gamma == 0.0 || gamma.abs() <= 1e-15 * (alpha * beta).sqrt() {
                    continue;
                }
                rotated = true;
                let zeta = (beta - alpha) / (2.0 * gamma);
                let sign_zeta = if zeta >= 0.0 { 1.0 } else { -1.0 };
                let t = sign_zeta / (zeta.abs() + (1.0 + zeta * zeta).sqrt());
                let c = 1.0 / (1.0 + t * t).sqrt();
                let s = c * t;
                for i in 0..k {
                    let ap = acol[i * n + p];
                    let aq = acol[i * n + q];
                    acol[i * n + p] = c * ap - s * aq;
                    acol[i * n + q] = s * ap + c * aq;
                }
                for i in 0..n {
                    let vp = v[i * n + p];
                    let vq = v[i * n + q];
                    v[i * n + p] = c * vp - s * vq;
                    v[i * n + q] = s * vp + c * vq;
                }
            }
        }
        if !rotated {
            break;
        }
    }

    // Largest singular value = largest column norm of the orthogonalized A.
    let mut best = 0usize;
    let mut best_norm = -1.0f64;
    for j in 0..n {
        let mut nrm = 0.0f64;
        for i in 0..k {
            let x = acol[i * n + j];
            nrm += x * x;
        }
        if nrm > best_norm {
            best_norm = nrm;
            best = j;
        }
    }
    let mut v1: Vec<f64> = (0..n).map(|i| v[i * n + best]).collect();
    canonicalize_sign(&mut v1);
    v1
}

/// Deterministic PLAGE sign convention: make the largest-magnitude entry of `v1`
/// positive. A singular vector is defined only up to sign, so both backends apply
/// the same rule to agree with each other (and to be reproducible across runs).
fn canonicalize_sign(v1: &mut [f64]) {
    let mut imax = 0usize;
    let mut vmax = 0.0f64;
    for (i, &val) in v1.iter().enumerate() {
        if val.abs() > vmax {
            vmax = val.abs();
            imax = i;
        }
    }
    if v1[imax] < 0.0 {
        for x in v1.iter_mut() {
            *x = -*x;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The SVD backend (faer by default, Jacobi with `--no-default-features`) must
    /// recover a known right-singular vector. For a rank-1 matrix `A = u wᵀ`, the
    /// first right-singular vector is `w / ‖w‖`.
    #[test]
    fn rank_one_right_singular_vector() {
        // u = (1, 2, 3)ᵀ, w = (2, 1) -> A is 3 x 2.
        let w = [2.0f64, 1.0];
        let u = [1.0f64, 2.0, 3.0];
        let (k, n) = (3usize, 2usize);
        let mut a = vec![0.0f64; k * n];
        for (i, &ui) in u.iter().enumerate() {
            for (j, &wj) in w.iter().enumerate() {
                a[i * n + j] = ui * wj;
            }
        }
        let v1 = leading_right_singular_vector(&a, k, n);
        let wn = (w[0] * w[0] + w[1] * w[1]).sqrt();
        let expect = [w[0] / wn, w[1] / wn]; // largest entry (index 0) already positive
        assert!((v1[0] - expect[0]).abs() < 1e-12, "v1={v1:?}");
        assert!((v1[1] - expect[1]).abs() < 1e-12, "v1={v1:?}");
        // Unit norm.
        let nrm = (v1[0] * v1[0] + v1[1] * v1[1]).sqrt();
        assert!((nrm - 1.0).abs() < 1e-12);
    }
}