xcell-rust 0.1.0

Pure-Rust port of xCell (Aran et al. 2017) cell-type enrichment — ssGSEA, spillover-corrected — validated for numeric parity against the R xCell package. Built on gsva-rust.
Documentation
//! Embedded xCell model data (xCell 1.1.0): the signatures, the scoring-gene
//! universe, the spillover matrix `K`, and the calibration table `fv`, for both
//! the RNA-seq and the microarray spill objects. All of it is exported from the
//! GPL-3 xCell package by `benchmarks/export_xcell_data.R`; xcell-rust is itself
//! GPL-3.0-or-later, so bundling it is license-clean.

use gsva::io::read_tsv_matrix;
use gsva::{ExprMatrix, GeneSets};

// Model data baked into the binary at compile time. `include_str!` resolves
// each path relative to this source file and yields a `&'static str`, so no
// file I/O happens at run time.

/// Signature gene sets in GMT form; parsed by `GeneSets::from_gmt` in
/// `XCellModel::load`.
const SIGNATURES_GMT: &str = include_str!("data/signatures.gmt");
/// Scoring-gene universe, split into one entry per non-blank line by `lines`.
const GENES_TXT: &str = include_str!("data/genes.txt");
/// Cell-type names, split into one entry per non-blank line by `lines`.
const CELLTYPES_TXT: &str = include_str!("data/celltypes.txt");
/// Spillover matrix `K` of the RNA-seq spill object, read with `read_tsv_matrix`.
const SPILL_K_TSV: &str = include_str!("data/spill_K.tsv");
/// Calibration table `fv` of the RNA-seq spill object, read with `read_tsv_matrix`.
const SPILL_FV_TSV: &str = include_str!("data/spill_fv.tsv");
/// Spillover matrix `K` of the microarray spill object, read with `read_tsv_matrix`.
const ARRAY_K_TSV: &str = include_str!("data/array_K.tsv");
/// Calibration table `fv` of the microarray spill object, read with `read_tsv_matrix`.
const ARRAY_FV_TSV: &str = include_str!("data/array_fv.tsv");

/// One xCell "spill" object: the spillover matrix `k` (cell types × cell types)
/// and the calibration table `fv` (cell types × `[V1, V2, V3]`). Both are
/// indexed by cell-type name, mirroring R's by-name matrix indexing.
///
/// Instances are built by the private `SpillModel::parse`, which asserts that
/// `k` is square and that `fv` has exactly three columns. The fields are
/// public, so code that mutates them afterwards is responsible for keeping
/// those properties intact.
pub struct SpillModel {
    /// Spillover matrix, square, rows and columns labelled by cell type in
    /// xCell's canonical order.
    pub k: ExprMatrix,
    /// Calibration table; columns are `V1`, `V2`, `V3`. `transformScores` uses
    /// only `V2` (the exponent) and `V3` (the scale divisor); `V1` is unused.
    /// The accessors `v2` and `v3` read column indices 1 and 2 respectively.
    pub fv: ExprMatrix,
}

impl SpillModel {
    /// Build a spill object from the TSV text of its `K` matrix and `fv` table.
    ///
    /// # Panics
    ///
    /// Panics if the parsed `K` is not square, or if the parsed `fv` does not
    /// have exactly three columns.
    fn parse(k_tsv: &str, fv_tsv: &str) -> Self {
        // Parse both tables first (K, then fv), then validate the pair.
        let model = Self {
            k: read_tsv_matrix(k_tsv),
            fv: read_tsv_matrix(fv_tsv),
        };
        assert_eq!(model.k.nrow(), model.k.ncol(), "spill K must be square");
        assert_eq!(
            model.fv.ncol(),
            3,
            "spill fv must have columns V1, V2, V3"
        );
        model
    }

    /// Look up the `V2` calibration value (the transform exponent) for
    /// `cell_type`. Returns `None` when `fv` has no row with that name.
    pub fn v2(&self, cell_type: &str) -> Option<f64> {
        let row = self.fv.row_of(cell_type)?;
        // `V2` is the second column of `fv` (index 1).
        Some(self.fv.get(row, 1))
    }

    /// Look up the `V3` calibration value (the transform scale divisor) for
    /// `cell_type`. Returns `None` when `fv` has no row with that name.
    pub fn v3(&self, cell_type: &str) -> Option<f64> {
        let row = self.fv.row_of(cell_type)?;
        // `V3` is the third column of `fv` (index 2).
        Some(self.fv.get(row, 2))
    }
}

/// The full xCell model: signatures, gene universe, canonical cell-type order,
/// and both spill objects (RNA-seq and microarray).
///
/// Obtain one with `XCellModel::load`, which parses the embedded data and
/// asserts the collection sizes quoted on the fields below (489 signatures,
/// 10808 genes, 64 cell types).
pub struct XCellModel {
    /// The 489 xCell signatures, named `cellType%source%idx.txt`.
    pub signatures: GeneSets,
    /// The 10 808-symbol scoring universe (`xCell.data$genes`).
    pub genes: Vec<String>,
    /// 64 cell types in xCell's canonical (R `aggregate()` / sorted) output
    /// order — the row order of `rawEnrichmentAnalysis`.
    pub cell_types: Vec<String>,
    /// Spill object used when `rnaseq = true` (`xCell.data$spill`); this is
    /// what `spill_for(true)` returns.
    pub spill: SpillModel,
    /// Spill object used when `rnaseq = false` (`xCell.data$spill.array`);
    /// this is what `spill_for(false)` returns.
    pub spill_array: SpillModel,
}

impl XCellModel {
    /// Parse the embedded xCell data. Cheap enough to call once per analysis.
    ///
    /// # Panics
    ///
    /// Panics if the embedded data does not contain exactly 489 signatures,
    /// 10808 universe genes and 64 cell types, or if either spill object fails
    /// the shape checks in `SpillModel::parse`.
    pub fn load() -> Self {
        // Each piece is validated right after it is parsed; the spill objects
        // are only parsed once all three size checks have passed.
        let signatures = GeneSets::from_gmt(SIGNATURES_GMT);
        assert_eq!(signatures.len(), 489, "expected 489 xCell signatures");

        let genes = lines(GENES_TXT);
        assert_eq!(genes.len(), 10808, "expected 10808 universe genes");

        let cell_types = lines(CELLTYPES_TXT);
        assert_eq!(cell_types.len(), 64, "expected 64 cell types");

        let spill = SpillModel::parse(SPILL_K_TSV, SPILL_FV_TSV);
        let spill_array = SpillModel::parse(ARRAY_K_TSV, ARRAY_FV_TSV);

        Self {
            signatures,
            genes,
            cell_types,
            spill,
            spill_array,
        }
    }

    /// Select the spill object for a platform: the RNA-seq one when `rnaseq`
    /// is `true`, the microarray one otherwise.
    pub fn spill_for(&self, rnaseq: bool) -> &SpillModel {
        if !rnaseq {
            return &self.spill_array;
        }
        &self.spill
    }
}

/// Split `text` into its non-blank lines.
///
/// Every line is trimmed of surrounding whitespace; lines that are empty after
/// trimming are dropped. The remaining entries are returned as owned strings
/// in their original order.
fn lines(text: &str) -> Vec<String> {
    text.lines()
        .filter_map(|raw| {
            let entry = raw.trim();
            (!entry.is_empty()).then(|| entry.to_owned())
        })
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test over the embedded data: collection sizes, spill-matrix
    /// dimensions, the first canonical cell type, signature naming, and `fv`
    /// lookups by cell-type name.
    #[test]
    fn model_loads_with_expected_shapes() {
        let model = XCellModel::load();

        // Sizes of the three top-level collections.
        assert_eq!(model.signatures.len(), 489);
        assert_eq!(model.genes.len(), 10808);
        assert_eq!(model.cell_types.len(), 64);

        // Spillover matrices: both dimensions for RNA-seq, rows for microarray.
        let rnaseq_k = &model.spill.k;
        assert_eq!(rnaseq_k.nrow(), 64);
        assert_eq!(rnaseq_k.ncol(), 64);
        assert_eq!(model.spill_array.k.nrow(), 64);

        // The canonical order begins with the lowercase-initial "aDC".
        assert_eq!(model.cell_types[0], "aDC");

        // The part of each signature name before the first '%' must be one of
        // the 64 known cell types.
        for signature in model.signatures.iter() {
            let ct = signature.name.split('%').next().unwrap();
            let is_known = model.cell_types.iter().any(|known| known == ct);
            assert!(is_known, "unknown cell type {ct}");
        }

        // Calibration lookups succeed for a cell type known to be present.
        let rnaseq_spill = &model.spill;
        assert!(rnaseq_spill.v2("B-cells").is_some());
        assert!(rnaseq_spill.v3("B-cells").is_some());
    }
}