Skip to main content

xcell/
data.rs

1//! Embedded xCell model data (xCell 1.1.0): the signatures, the scoring-gene
2//! universe, the spillover matrix `K`, and the calibration table `fv`, for both
3//! the RNA-seq and the microarray spill objects. All of it is exported from the
4//! GPL-3 xCell package by `benchmarks/export_xcell_data.R`; xcell-rust is itself
5//! GPL-3.0-or-later, so bundling it is license-clean.
6
7use gsva::io::read_tsv_matrix;
8use gsva::{ExprMatrix, GeneSets};
9
10const SIGNATURES_GMT: &str = include_str!("data/signatures.gmt");
11const GENES_TXT: &str = include_str!("data/genes.txt");
12const CELLTYPES_TXT: &str = include_str!("data/celltypes.txt");
13const SPILL_K_TSV: &str = include_str!("data/spill_K.tsv");
14const SPILL_FV_TSV: &str = include_str!("data/spill_fv.tsv");
15const ARRAY_K_TSV: &str = include_str!("data/array_K.tsv");
16const ARRAY_FV_TSV: &str = include_str!("data/array_fv.tsv");
17
18/// One xCell "spill" object: the spillover matrix `k` (cell types × cell types)
19/// and the calibration table `fv` (cell types × `[V1, V2, V3]`). Both are
20/// indexed by cell-type name, mirroring R's by-name matrix indexing.
21pub struct SpillModel {
22    /// Spillover matrix, square, rows and columns labelled by cell type in
23    /// xCell's canonical order.
24    pub k: ExprMatrix,
25    /// Calibration table; columns are `V1`, `V2`, `V3`. `transformScores` uses
26    /// only `V2` (the exponent) and `V3` (the scale divisor); `V1` is unused.
27    pub fv: ExprMatrix,
28}
29
30impl SpillModel {
31    fn parse(k_tsv: &str, fv_tsv: &str) -> Self {
32        let k = read_tsv_matrix(k_tsv);
33        let fv = read_tsv_matrix(fv_tsv);
34        assert_eq!(k.nrow(), k.ncol(), "spill K must be square");
35        assert_eq!(fv.ncol(), 3, "spill fv must have columns V1, V2, V3");
36        SpillModel { k, fv }
37    }
38
39    /// `V2` (the transform exponent) for a cell type, if present in `fv`.
40    pub fn v2(&self, cell_type: &str) -> Option<f64> {
41        self.fv.row_of(cell_type).map(|r| self.fv.get(r, 1))
42    }
43
44    /// `V3` (the transform scale divisor) for a cell type, if present in `fv`.
45    pub fn v3(&self, cell_type: &str) -> Option<f64> {
46        self.fv.row_of(cell_type).map(|r| self.fv.get(r, 2))
47    }
48}
49
/// Everything xCell needs to score a dataset: the signatures, the gene
/// universe, the canonical cell-type order, and one spill object per platform
/// (RNA-seq and microarray).
pub struct XCellModel {
    /// All 489 xCell signatures; each is named `cellType%source%idx.txt`.
    pub signatures: GeneSets,
    /// Scoring universe of 10,808 gene symbols (`xCell.data$genes`).
    pub genes: Vec<String>,
    /// The 64 cell types in xCell's canonical output order (R `aggregate()` /
    /// sorted), which is also the row order of `rawEnrichmentAnalysis`.
    pub cell_types: Vec<String>,
    /// RNA-seq spill object (`xCell.data$spill`), used when `rnaseq = true`.
    pub spill: SpillModel,
    /// Microarray spill object (`xCell.data$spill.array`), used when
    /// `rnaseq = false`.
    pub spill_array: SpillModel,
}
65
66impl XCellModel {
67    /// Parse the embedded xCell data. Cheap enough to call once per analysis.
68    pub fn load() -> Self {
69        let signatures = GeneSets::from_gmt(SIGNATURES_GMT);
70        let genes = lines(GENES_TXT);
71        let cell_types = lines(CELLTYPES_TXT);
72        assert_eq!(signatures.len(), 489, "expected 489 xCell signatures");
73        assert_eq!(genes.len(), 10808, "expected 10808 universe genes");
74        assert_eq!(cell_types.len(), 64, "expected 64 cell types");
75        XCellModel {
76            signatures,
77            genes,
78            cell_types,
79            spill: SpillModel::parse(SPILL_K_TSV, SPILL_FV_TSV),
80            spill_array: SpillModel::parse(ARRAY_K_TSV, ARRAY_FV_TSV),
81        }
82    }
83
84    /// The spill object for the chosen platform (`rnaseq = true` → RNA-seq).
85    pub fn spill_for(&self, rnaseq: bool) -> &SpillModel {
86        if rnaseq {
87            &self.spill
88        } else {
89            &self.spill_array
90        }
91    }
92}
93
/// Splits `text` into its lines, trims surrounding whitespace from each, and
/// returns the non-empty ones as owned strings, in their original order.
fn lines(text: &str) -> Vec<String> {
    let mut entries = Vec::new();
    for raw in text.lines() {
        let entry = raw.trim();
        if entry.is_empty() {
            // Blank (or whitespace-only) lines carry no entry.
            continue;
        }
        entries.push(entry.to_string());
    }
    entries
}
101
#[cfg(test)]
mod tests {
    use super::*;

    /// Loads the embedded model and checks its table sizes, the head of the
    /// canonical order, signature naming, and that `fv` lookups resolve.
    #[test]
    fn model_loads_with_expected_shapes() {
        let model = XCellModel::load();

        // Top-level table sizes.
        assert_eq!(model.signatures.len(), 489);
        assert_eq!(model.genes.len(), 10808);
        assert_eq!(model.cell_types.len(), 64);

        // Spillover matrices are indexed by the 64 cell types.
        assert_eq!((model.spill.k.nrow(), model.spill.k.ncol()), (64, 64));
        assert_eq!(model.spill_array.k.nrow(), 64);

        // The canonical order begins with the lowercase-initial "aDC".
        assert_eq!(model.cell_types[0], "aDC");

        // The cell-type prefix of every signature name must be one of the 64.
        for signature in model.signatures.iter() {
            let ct = signature.name.split('%').next().unwrap();
            let known = model.cell_types.iter().any(|name| name.as_str() == ct);
            assert!(known, "unknown cell type {ct}");
        }

        // A known cell type resolves in the calibration table.
        let rnaseq = &model.spill;
        assert!(rnaseq.v2("B-cells").is_some());
        assert!(rnaseq.v3("B-cells").is_some());
    }
}