gsva-rust 0.1.0

Pure-Rust port of the GSVA family of gene-set enrichment methods (GSVA, ssGSEA, z-score, PLAGE), validated for numeric parity against the Bioconductor GSVA package.
Documentation
//! Combined z-score gene-set enrichment (Lee *et al.* 2008), as implemented by
//! GSVA's `zscore` path.
//!
//! Each gene is standardized across samples to mean 0 and unit sample standard
//! deviation; a gene set's per-sample score is the sum of its members'
//! standardized values divided by the square root of the set size.

use crate::matrix::ExprMatrix;
use crate::prep;
use crate::result::EnrichmentResult;

/// Parameters for the z-score method. Defaults mirror GSVA `zscoreParam`:
/// `min_size = 1`, `max_size = unbounded`.
#[derive(Clone, Debug)]
pub struct ZscoreParams {
    /// Minimum number of matched genes for a set to be scored.
    pub min_size: usize,
    /// Maximum number of matched genes for a set to be scored.
    pub max_size: usize,
}

impl Default for ZscoreParams {
    fn default() -> Self {
        ZscoreParams {
            min_size: 1,
            max_size: usize::MAX,
        }
    }
}

/// Compute combined z-score enrichment scores.
///
/// Mirrors `gsva(zscoreParam(expr, geneSets, minSize, maxSize))`: constant rows
/// are dropped, gene sets are mapped to the remaining rows and size-filtered,
/// rows are centered/scaled, and each set score is `colSums(Z[idx, ]) / sqrt(k)`.
///
/// Panics on the conditions R aborts on (no genes matched, no set left, or
/// duplicated surviving set names).
pub fn zscore(
    expr: &ExprMatrix,
    gene_sets: &crate::geneset::GeneSets,
    params: &ZscoreParams,
) -> EnrichmentResult {
    let filtered = prep::filter_constant_rows(expr);
    let (names, indices) =
        prep::map_and_filter_sets(&filtered, gene_sets, params.min_size, params.max_size);

    let z = prep::scale_rows(&filtered);
    let n = filtered.ncol();

    let mut scores = vec![0.0f64; names.len() * n];
    // Each gene set's score row is independent — parallelize over sets; the
    // per-sample compensated sum below is byte-for-byte unchanged.
    crate::par::fill_chunks_mut(&mut scores, n, |s, out| {
        let idx = &indices[s];
        let denom = (idx.len() as f64).sqrt();
        for (j, slot) in out.iter_mut().enumerate() {
            // Compensated (Neumaier) sum in mapped-gene order, matching R's
            // `colSums(Z[gSetIdx, ])`: R accumulates in long double, so the
            // compensation keeps the f64 sum within ~1 ULP of the reference.
            let mut sum = 0.0f64;
            let mut comp = 0.0f64;
            for &g in idx {
                let v = z[g * n + j];
                let t = sum + v;
                if sum.abs() >= v.abs() {
                    comp += (sum - t) + v;
                } else {
                    comp += (v - t) + sum;
                }
                sum = t;
            }
            *slot = (sum + comp) / denom;
        }
    });

    EnrichmentResult {
        gene_sets: names,
        samples: filtered.col_names().to_vec(),
        scores,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::geneset::{GeneSet, GeneSets};

    #[test]
    fn small_zscore_shapes() {
        // 4 genes x 3 samples, one gene set of 2 genes.
        let expr = ExprMatrix::new(
            vec!["G1".into(), "G2".into(), "G3".into(), "G4".into()],
            vec!["S1".into(), "S2".into(), "S3".into()],
            vec![
                1.0, 2.0, 3.0, // G1
                4.0, 6.0, 8.0, // G2
                3.0, 2.0, 1.0, // G3
                10.0, 5.0, 1.0, // G4
            ],
        );
        let gs = GeneSets::new(vec![GeneSet::new("SET", vec!["G1".into(), "G2".into()])]);
        let res = zscore(&expr, &gs, &ZscoreParams::default());
        assert_eq!(res.gene_sets, ["SET"]);
        assert_eq!(res.samples, ["S1", "S2", "S3"]);
        assert_eq!(res.scores.len(), 3);
        // G1 and G2 are both increasing across samples, so standardized values
        // are symmetric and the set score should be negative, ~0, positive.
        assert!(res.score(0, 0) < 0.0);
        assert!(res.score(0, 2) > 0.0);
        assert!(res.score(0, 1).abs() < 1e-12);
    }
}