gsva-rust 0.1.0

Pure-Rust port of the GSVA family of gene-set enrichment methods (GSVA, ssGSEA, z-score, PLAGE), validated for numeric parity against the Bioconductor GSVA package.
Documentation
//! Single-sample GSEA (Barbie *et al.* 2009), as implemented by GSVA's `ssgsea`
//! path.
//!
//! Within each sample genes are ranked (average ties), the ranks are truncated
//! to integers and raised to the power `alpha`, and each gene set's score is the
//! difference between the rank-weighted in-set and out-of-set step CDFs of a
//! Kolmogorov–Smirnov-like walk. With `normalize = true` the whole score matrix
//! is divided by its global range, as Barbie *et al.* prescribe.

use crate::matrix::ExprMatrix;
use crate::par;
use crate::prep;
use crate::rank;
use crate::result::EnrichmentResult;

/// Parameters for ssGSEA. Defaults mirror GSVA `ssgseaParam`: `min_size = 1`,
/// `max_size = unbounded`, `alpha = 0.25`, `normalize = true`.
#[derive(Clone, Debug)]
pub struct SsgseaParams {
    /// Minimum number of matched genes for a set to be scored.
    pub min_size: usize,
    /// Maximum number of matched genes for a set to be scored.
    pub max_size: usize,
    /// Exponent applied to the integer ranks (`Ra = R^alpha`). GSVA default 0.25.
    pub alpha: f64,
    /// Divide the whole score matrix by its global `max - min` (Barbie 2009).
    pub normalize: bool,
}

impl Default for SsgseaParams {
    fn default() -> Self {
        SsgseaParams {
            min_size: 1,
            max_size: usize::MAX,
            alpha: 0.25,
            normalize: true,
        }
    }
}

/// Compute single-sample GSEA enrichment scores.
///
/// Mirrors `gsva(ssgseaParam(expr, geneSets, minSize, maxSize, alpha, normalize))`.
/// Unlike z-score/PLAGE, ssGSEA does **not** drop constant rows: every gene
/// participates in the per-sample ranking, so `n` is the full feature count.
///
/// Panics on the conditions R aborts on (no genes matched, no set left, or
/// duplicated surviving set names).
pub fn ssgsea(
    expr: &ExprMatrix,
    gene_sets: &crate::geneset::GeneSets,
    params: &SsgseaParams,
) -> EnrichmentResult {
    // ssGSEA ranks the full matrix (removeConstant = FALSE in GSVA): no row
    // filtering, so map gene sets directly onto the input features.
    let (names, indices) =
        prep::map_and_filter_sets(expr, gene_sets, params.min_size, params.max_size);

    let p = expr.nrow(); // number of genes = walk length `n` in GSVA
    let nsamp = expr.ncol();
    let pf = p as f64;
    // Sum over all positions of (n - pos + 1), i.e. n*(n+1)/2; exact in f64 for
    // realistic gene counts (< 2^26).
    let total_weight = pf * (pf + 1.0) / 2.0;
    let unit_alpha = params.alpha == 1.0;

    let nsets = names.len();
    // Each sample is independent: rank its column once (average ties, truncated to
    // integers), raise to `alpha`, then score every gene set. Parallelize over
    // samples — each produces one score column (gene sets for that sample), in
    // sample order — then scatter into the row-major matrix. All ranking/position
    // scratch is per-sample, so nothing is shared mutably and the output is
    // bit-identical to the serial build and to R.
    let columns: Vec<Vec<f64>> = par::map_collect(nsamp, |j| {
        // Column j: per-gene values, then average ranks truncated toward zero to
        // integers (GSVA: `colRanks(ties.method="average")` then `type<-int`).
        let col: Vec<f64> = (0..p).map(|i| expr.get(i, j)).collect();
        let ranks = rank::rank_average(&col);
        let r_int: Vec<f64> = ranks.iter().map(|&r| r.trunc()).collect();
        // Ra = R^alpha (R raised element-wise; `alpha == 1` leaves R unchanged).
        let ra: Vec<f64> = if unit_alpha {
            r_int.clone()
        } else {
            r_int.iter().map(|&r| r.powf(params.alpha)).collect()
        };

        // Descending rank order and the 1-based position of each gene within it.
        let order_desc = rank::order_decreasing(&r_int);
        let mut pos = vec![0usize; p];
        for (rank_pos, &g) in order_desc.iter().enumerate() {
            pos[g] = rank_pos + 1;
        }

        indices
            .iter()
            .map(|idx| {
                let k = idx.len();
                // Accumulate in mapped-gene order (R's `Ra[geneRanking[gSetIdx], j]`)
                // with compensated (Neumaier) sums, matching R's long-double `sum`.
                let mut num_sum = 0.0f64; // Σ Ra_m * (n - pos_m + 1)
                let mut num_comp = 0.0f64;
                let mut den_sum = 0.0f64; // Σ Ra_m
                let mut den_comp = 0.0f64;
                let mut wsum = 0.0f64; // Σ (n - pos_m + 1)  (exact integers)
                for &g in idx {
                    let weight = (p - pos[g] + 1) as f64;
                    let ra_g = ra[g];
                    let term = ra_g * weight;
                    // num += term
                    let t = num_sum + term;
                    if num_sum.abs() >= term.abs() {
                        num_comp += (num_sum - t) + term;
                    } else {
                        num_comp += (term - t) + num_sum;
                    }
                    num_sum = t;
                    // den += ra_g
                    let t = den_sum + ra_g;
                    if den_sum.abs() >= ra_g.abs() {
                        den_comp += (den_sum - t) + ra_g;
                    } else {
                        den_comp += (ra_g - t) + den_sum;
                    }
                    den_sum = t;
                    wsum += weight;
                }
                let step_in = (num_sum + num_comp) / (den_sum + den_comp);
                let step_out = (total_weight - wsum) / ((p - k) as f64);
                step_in - step_out
            })
            .collect()
    });

    let mut scores = vec![0.0f64; nsets * nsamp];
    for (j, col) in columns.iter().enumerate() {
        for (s, &v) in col.iter().enumerate() {
            scores[s * nsamp + j] = v;
        }
    }

    if params.normalize {
        // Global normalization across the entire score matrix (Barbie 2009).
        let mut mn = f64::INFINITY;
        let mut mx = f64::NEG_INFINITY;
        for &v in &scores {
            if v < mn {
                mn = v;
            }
            if v > mx {
                mx = v;
            }
        }
        let denom = mx - mn;
        for v in &mut scores {
            *v /= denom;
        }
    }

    EnrichmentResult {
        gene_sets: names,
        samples: expr.col_names().to_vec(),
        scores,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::geneset::{GeneSet, GeneSets};

    #[test]
    fn small_ssgsea_shapes() {
        // 4 genes x 3 samples, one gene set of 2 genes.
        let expr = ExprMatrix::new(
            vec!["G1".into(), "G2".into(), "G3".into(), "G4".into()],
            vec!["S1".into(), "S2".into(), "S3".into()],
            vec![
                1.0, 2.0, 3.0, // G1
                4.0, 6.0, 8.0, // G2
                3.0, 2.0, 1.0, // G3
                10.0, 5.0, 1.0, // G4
            ],
        );
        let gs = GeneSets::new(vec![GeneSet::new("SET", vec!["G1".into(), "G2".into()])]);
        let res = ssgsea(&expr, &gs, &SsgseaParams::default());
        assert_eq!(res.gene_sets, ["SET"]);
        assert_eq!(res.samples, ["S1", "S2", "S3"]);
        assert_eq!(res.scores.len(), 3);
    }
}