gsva-rust 0.1.0

//! GSVA enrichment scores (Hänzelmann *et al.* 2013), the namesake method of the
//! Bioconductor `GSVA` package.
//!
//! Pipeline, faithful to GSVA 2.6.2's `gsvaParam` path:
//!
//! 1. Drop constant rows (`removeConstant = TRUE`), then map gene sets onto the
//!    surviving features and size-filter them.
//! 2. **kcdf** — per gene, a kernel estimate of its expression CDF evaluated at
//!    every sample (Gaussian for continuous data; see [`crate::kcdf`]). GSVA
//!    takes the logit of this; we rank the CDF directly (rank-equivalent).
//! 3. Per sample, rank genes by their kcdf value (`ties.method = "last"`) and
//!    derive the decreasing-order statistic `dos = p − rank + 1` and the
//!    symmetric rank statistic `srs = |p/2 − rank|`.
//! 4. For each gene set, a Kolmogorov–Smirnov-style random walk over the ranking
//!    yields the largest positive and negative deviations; the score combines
//!    them (`maxDiff`/`absRanking`).
//!
//! The walk arithmetic mirrors the package's C scorer (`src/ks_test.c`,
//! `gsva_rnd_walk`) operation-for-operation — plain `f64` running sums for the
//! in-set CDF, exact integer running counts for the out-of-set CDF — so scores
//! match Bioconductor GSVA to `f64` rounding.

use crate::geneset::GeneSets;
use crate::kcdf;
use crate::matrix::ExprMatrix;
use crate::par;
use crate::prep;
use crate::rank;
use crate::result::EnrichmentResult;

/// Kernel used to estimate each gene's expression CDF (`kcdf` in GSVA).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Kcdf {
    /// Gaussian kernel, for continuous data (microarray, log-CPM). GSVA's
    /// `kcdf = "Gaussian"`, and what `kcdf = "auto"` selects for non-integer
    /// input.
    Gaussian,
    /// Poisson kernel, for integer count data. GSVA's `kcdf = "Poisson"`.
    Poisson,
}

/// Parameters for GSVA. Defaults mirror GSVA `gsvaParam`: `min_size = 1`,
/// `max_size = unbounded`, `tau = 1`, `max_diff = true`, `abs_ranking = false`,
/// `kcdf = Gaussian`.
#[derive(Clone, Debug)]
pub struct GsvaParams {
    /// Minimum number of matched genes for a set to be scored.
    pub min_size: usize,
    /// Maximum number of matched genes for a set to be scored.
    pub max_size: usize,
    /// Exponent weighting the random-walk steps by the symmetric rank statistic.
    /// GSVA default `1.0`.
    pub tau: f64,
    /// `true` (default): score is the sum of the largest positive and negative
    /// walk deviations. `false`: take whichever deviation is larger in
    /// magnitude (the classic enrichment-statistic choice).
    pub max_diff: bool,
    /// Only meaningful with `max_diff = true`: subtract the negative deviation
    /// instead of adding it (`|pos| + |neg|`). GSVA default `false`.
    pub abs_ranking: bool,
    /// Kernel for the per-gene CDF estimate.
    pub kcdf: Kcdf,
}

impl Default for GsvaParams {
    fn default() -> Self {
        GsvaParams {
            min_size: 1,
            max_size: usize::MAX,
            tau: 1.0,
            max_diff: true,
            abs_ranking: false,
            kcdf: Kcdf::Gaussian,
        }
    }
}

/// Compute GSVA enrichment scores.
///
/// Mirrors `gsva(gsvaParam(expr, geneSets, kcdf, tau, maxDiff, absRanking,
/// minSize, maxSize))`. Constant rows are dropped first (`removeConstant =
/// TRUE`), so `n` is the number of non-constant genes.
///
/// Panics on the conditions R aborts on (fewer than two rows after dropping
/// constant rows, no genes matched, no set left, or duplicated surviving set
/// names).
pub fn gsva(expr: &ExprMatrix, gene_sets: &GeneSets, params: &GsvaParams) -> EnrichmentResult {
    // 1. removeConstant = TRUE, then map/size-filter gene sets onto survivors.
    let filtered = prep::filter_constant_rows(expr);
    let (names, indices) =
        prep::map_and_filter_sets(&filtered, gene_sets, params.min_size, params.max_size);

    let p = filtered.nrow();
    let nsamp = filtered.ncol();

    // 2. kcdf: per-gene CDF estimate at each sample (rank-equivalent to GSVA's
    //    logit output).
    let kcdf_vals = match params.kcdf {
        Kcdf::Gaussian => kcdf::gaussian_left_tail(&filtered),
        Kcdf::Poisson => kcdf::poisson_left_tail(&filtered),
    };

    let half_p = p as f64 / 2.0;
    let nsets = names.len();

    // Each sample is independent: rank its kcdf column once (ties.method =
    // "last"), derive dos/srs, then walk every gene set. Parallelize over
    // samples — each produces one score column (gene sets for that sample), in
    // sample order — then scatter into the row-major matrix. The dos/srs scratch
    // is per-sample, so nothing is shared mutably and the output is bit-identical
    // to the serial build.
    let columns: Vec<Vec<f64>> = par::map_collect(nsamp, |j| {
        // 3. Per-sample ranks of the kcdf column, then dos / srs (ranks2stats).
        let col: Vec<f64> = (0..p).map(|g| kcdf_vals[g * nsamp + j]).collect();
        let ranks = rank::rank_last(&col);
        let mut dos = vec![0usize; p];
        let mut srs = vec![0.0f64; p];
        for g in 0..p {
            let r = ranks[g];
            dos[g] = p - r + 1;
            srs[g] = (half_p - r as f64).abs();
        }
        // 4. Random walk per gene set.
        indices
            .iter()
            .map(|members| walk_score(members, &dos, &srs, p, params))
            .collect()
    });

    let mut scores = vec![0.0f64; nsets * nsamp];
    for (j, col) in columns.iter().enumerate() {
        for (s, &v) in col.iter().enumerate() {
            scores[s * nsamp + j] = v;
        }
    }

    EnrichmentResult {
        gene_sets: names,
        samples: expr.col_names().to_vec(),
        scores,
    }
}

/// One gene set's GSVA statistic for one sample, reproducing C `gsva_rnd_walk`
/// followed by the score combination in `gsva_score_genesets_R`.
fn walk_score(members: &[usize], dos: &[usize], srs: &[f64], n: usize, params: &GsvaParams) -> f64 {
    // In-set step CDF: srs at each member's decreasing-order position (plain
    // f64, matching the C running sum). Out-of-set step CDF: 1 at every
    // non-member position (exact integer running count).
    let mut step_in = vec![0.0f64; n];
    let mut step_out = vec![1i64; n];
    let unit_tau = params.tau == 1.0;
    for &g in members {
        let pos = dos[g] - 1; // dos is 1-based; convert to 0-based index.
        step_in[pos] = if unit_tau {
            srs[g]
        } else {
            srs[g].powf(params.tau)
        };
        step_out[pos] = 0;
    }
    for i in 1..n {
        step_in[i] += step_in[i - 1];
        step_out[i] += step_out[i - 1];
    }

    let in_last = step_in[n - 1];
    let out_last = step_out[n - 1];
    // R returns NA when either arm never accumulates; propagate as NaN.
    if !(in_last > 0.0 && out_last > 0) {
        return f64::NAN;
    }
    let out_last_f = out_last as f64;

    // Largest positive and (most) negative deviations, each clamped against 0
    // by the running-max/min initialised at 0 (exactly the C loop).
    let mut walk_pos = 0.0f64;
    let mut walk_neg = 0.0f64;
    for i in 0..n {
        let w = step_in[i] / in_last - step_out[i] as f64 / out_last_f;
        if w > walk_pos {
            walk_pos = w;
        }
        if w < walk_neg {
            walk_neg = w;
        }
    }

    if params.max_diff {
        if params.abs_ranking {
            walk_pos - walk_neg
        } else {
            walk_pos + walk_neg
        }
    } else if walk_pos > walk_neg.abs() {
        walk_pos
    } else {
        walk_neg
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::geneset::{GeneSet, GeneSets};

    #[test]
    fn small_gsva_shapes() {
        let expr = ExprMatrix::new(
            vec!["G1".into(), "G2".into(), "G3".into(), "G4".into()],
            vec!["S1".into(), "S2".into(), "S3".into()],
            vec![
                1.0, 2.0, 3.0, // G1
                4.0, 6.0, 8.0, // G2
                3.0, 2.0, 1.0, // G3
                10.0, 5.0, 2.0, // G4
            ],
        );
        let gs = GeneSets::new(vec![GeneSet::new(
            "SET",
            vec!["G1".into(), "G2".into(), "G3".into()],
        )]);
        let res = gsva(&expr, &gs, &GsvaParams::default());
        assert_eq!(res.gene_sets, ["SET"]);
        assert_eq!(res.samples, ["S1", "S2", "S3"]);
        assert_eq!(res.scores.len(), 3);
        assert!(res.scores.iter().all(|v| v.is_finite()));
    }
}