xcell-rust 0.1.0

Pure-Rust port of xCell (Aran et al. 2017) cell-type enrichment — ssGSEA, spillover-corrected — validated for numeric parity against the R xCell package. Built on gsva-rust.
Documentation
//! Stage 1 of xCell: `rawEnrichmentAnalysis`.
//!
//! Mirrors xCell 1.1.0 operation-for-operation: keep the genes shared with the
//! scoring universe (at least [`MIN_GENES`]), replace each sample column by its
//! average-tie ranks, run **unnormalized ssGSEA** over the 489 signatures,
//! subtract each signature's per-row minimum, then average the signatures of
//! each cell type. The result is the 64 cell types × samples in xCell's
//! canonical order.

use std::collections::{HashMap, HashSet};

use gsva::{EnrichmentResult, ExprMatrix, SsgseaParams};

use crate::data::XCellModel;

/// xCell aborts (`return -1`) when fewer than this many genes are shared with
/// its scoring universe. We fail loudly instead of emitting a degenerate result.
pub const MIN_GENES: usize = 5000;

/// Compute the raw (pre-spillover) per-cell-type enrichment scores.
pub fn raw_enrichment(expr: &ExprMatrix, model: &XCellModel) -> EnrichmentResult {
    let universe: HashSet<&str> = model.genes.iter().map(String::as_str).collect();

    // intersect(rownames(expr), genes): expr rows that are in the universe, in
    // expr row order, de-duplicated (R `intersect` returns unique values).
    let mut shared_rows = Vec::new();
    let mut shared_names = Vec::new();
    let mut seen: HashSet<&str> = HashSet::new();
    for (i, name) in expr.row_names().iter().enumerate() {
        let s = name.as_str();
        if universe.contains(s) && seen.insert(s) {
            shared_rows.push(i);
            shared_names.push(name.clone());
        }
    }
    assert!(
        shared_names.len() >= MIN_GENES,
        "xCell needs at least {MIN_GENES} genes shared with its universe, found {}",
        shared_names.len()
    );

    // expr <- apply(expr, 2, rank): per-sample average-tie ranks over the shared
    // sub-matrix. This ranked matrix is what xCell hands to `ssgseaParam`.
    let nsamp = expr.ncol();
    let p = shared_rows.len();
    let mut ranked = vec![0.0f64; p * nsamp];
    for j in 0..nsamp {
        let col: Vec<f64> = shared_rows.iter().map(|&i| expr.get(i, j)).collect();
        let r = gsva::rank::rank_average(&col);
        for (i, &rv) in r.iter().enumerate() {
            ranked[i * nsamp + j] = rv;
        }
    }
    let ranked = ExprMatrix::new(shared_names, expr.col_names().to_vec(), ranked);

    // GSVA::gsva(ssgseaParam(expr, signatures, normalize = FALSE))
    let params = SsgseaParams {
        normalize: false,
        ..SsgseaParams::default()
    };
    let ss = gsva::ssgsea(&ranked, &model.signatures, &params);
    let nsig = ss.gene_sets.len();

    // scores <- scores - apply(scores, 1, min): subtract each signature's
    // minimum across samples (exact subtraction, no rounding).
    let mut sc = ss.scores;
    for s in 0..nsig {
        let mn = sc[s * nsamp..(s + 1) * nsamp]
            .iter()
            .copied()
            .fold(f64::INFINITY, f64::min);
        for v in &mut sc[s * nsamp..(s + 1) * nsamp] {
            *v -= mn;
        }
    }

    // aggregate(scores ~ cell_types, FUN = mean): group surviving signatures by
    // their cell type (field 1 of `cellType%source%idx`) and average, emitting
    // cell types in xCell's canonical order.
    let mut groups: HashMap<&str, Vec<usize>> = HashMap::new();
    for (s, name) in ss.gene_sets.iter().enumerate() {
        let ct = name.split('%').next().unwrap_or(name.as_str());
        groups.entry(ct).or_default().push(s);
    }

    let mut out_types = Vec::new();
    let mut scores = Vec::new();
    for ct in &model.cell_types {
        let Some(idxs) = groups.get(ct.as_str()) else {
            continue; // a cell type whose signatures were all filtered out
        };
        out_types.push(ct.clone());
        let k = idxs.len() as f64;
        for j in 0..nsamp {
            let sum: f64 = idxs.iter().map(|&s| sc[s * nsamp + j]).sum();
            scores.push(sum / k);
        }
    }

    EnrichmentResult {
        gene_sets: out_types,
        samples: expr.col_names().to_vec(),
        scores,
    }
}