Skip to main content

mcpcounter_rust/
mcpcounter.rs

1//! MCP-counter (Becht et al. 2016): marker-gene abundance estimation.
2//!
3//! Port of `MCPcounter.estimate` from the GPL-3 MCPcounter R package
4//! (<https://github.com/ebecht/MCPcounter>). For each cell population, the score
5//! in a sample is the arithmetic mean (R two-pass, `na.rm`) of that population's
6//! marker features that are present in the expression matrix. No internal
7//! transform or normalization is applied, matching the R implementation.
8//!
9//! The bundled marker signatures (`signatures/mcpcounter_*.txt`) are the
10//! upstream CC0 / public-domain data files.
11
12use std::collections::{HashMap, HashSet};
13
14use crate::matrix::ExprMatrix;
15use crate::result::DeconvResult;
16use crate::stats::r_mean_narm;
17
18const GENES_TSV: &str = include_str!("signatures/mcpcounter_genes.txt");
19const PROBESETS_TSV: &str = include_str!("signatures/mcpcounter_probesets.txt");
20
21/// Feature identifier namespace used to match markers against the expression
22/// matrix row names. Mirrors `featuresType` in the R package.
23#[derive(Clone, Copy, Debug, PartialEq, Eq)]
24pub enum FeaturesType {
25    /// Affymetrix HG 133 Plus 2.0 probesets (the R default).
26    Affy133P2Probesets,
27    /// Official HUGO gene symbols.
28    HugoSymbols,
29    /// Entrez gene IDs.
30    EntrezId,
31    /// Ensembl gene IDs.
32    EnsemblId,
33}
34
35/// Strip a single pair of surrounding double quotes (R `write.table` style).
36fn unquote(s: &str) -> &str {
37    let s = s.trim();
38    s.strip_prefix('"')
39        .and_then(|x| x.strip_suffix('"'))
40        .unwrap_or(s)
41}
42
43/// `(marker id, population)` pairs for the requested feature namespace, in the
44/// row order of the signature file.
45fn signature_pairs(ft: FeaturesType) -> Vec<(String, String)> {
46    if ft == FeaturesType::Affy133P2Probesets {
47        // Headerless TSV: probeset, population.
48        return PROBESETS_TSV
49            .lines()
50            .filter(|l| !l.trim().is_empty())
51            .map(|l| {
52                let mut it = l.split('\t');
53                let id = unquote(it.next().unwrap_or(""));
54                let pop = unquote(it.next().unwrap_or(""));
55                (id.to_string(), pop.to_string())
56            })
57            .collect();
58    }
59
60    // genes.txt header: "HUGO symbols"\t"Cell population"\t"ENTREZID"\t"ENSEMBL ID"
61    let col = match ft {
62        FeaturesType::HugoSymbols => 0,
63        FeaturesType::EntrezId => 2,
64        FeaturesType::EnsemblId => 3,
65        FeaturesType::Affy133P2Probesets => unreachable!(),
66    };
67    GENES_TSV
68        .lines()
69        .skip(1) // header
70        .filter(|l| !l.trim().is_empty())
71        .filter_map(|l| {
72            let fields: Vec<&str> = l.split('\t').collect();
73            let id = unquote(fields.get(col).copied().unwrap_or(""));
74            let pop = unquote(fields.get(1).copied().unwrap_or(""));
75            // Unquoted `NA` (e.g. missing Ensembl ID) is never a valid feature.
76            if id.is_empty() || id == "NA" {
77                None
78            } else {
79                Some((id.to_string(), pop.to_string()))
80            }
81        })
82        .collect()
83}
84
85/// Estimate cell-population abundances with MCP-counter.
86///
87/// Returns populations (rows) by samples (columns). A population is emitted only
88/// if at least one of its markers is present in `expr`; populations appear in
89/// their first-appearance order in the signature table.
90pub fn estimate(expr: &ExprMatrix, features_type: FeaturesType) -> DeconvResult {
91    let pairs = signature_pairs(features_type);
92
93    // Population output order = first appearance in the signature table.
94    let mut pop_order: Vec<&str> = Vec::new();
95    let mut seen: HashSet<&str> = HashSet::new();
96    for (_, pop) in &pairs {
97        if seen.insert(pop.as_str()) {
98            pop_order.push(pop.as_str());
99        }
100    }
101
102    // Resolve each population's marker rows by direct lookup in the matrix's row
103    // index (O(markers)), rather than scanning every row name once per
104    // population (the old O(genes x populations) cost). `pairs` is the signature
105    // in table order; `row_of` returns the first matrix row for a name.
106    let mut pop_rows: HashMap<&str, Vec<usize>> = HashMap::new();
107    for (id, pop) in &pairs {
108        if let Some(row) = expr.row_of(id) {
109            pop_rows.entry(pop.as_str()).or_default().push(row);
110        }
111    }
112    // Sort to ascending row position and de-duplicate to one row per marker: this
113    // reproduces R's `expr[intersect(rownames(expr), markers), ]` (row-name
114    // order, first row per name). The order matters — the two-pass mean is
115    // summation-order sensitive at the last ULP — so it must match R to stay
116    // bit-identical.
117    for rows in pop_rows.values_mut() {
118        rows.sort_unstable();
119        rows.dedup();
120    }
121
122    let ncol = expr.ncol();
123    let mut populations: Vec<String> = Vec::new();
124    let mut scores: Vec<f64> = Vec::new();
125    let mut col_vals: Vec<f64> = Vec::new();
126
127    for pop in &pop_order {
128        let Some(rows) = pop_rows.get(pop) else {
129            continue;
130        };
131        if rows.is_empty() {
132            continue;
133        }
134
135        populations.push((*pop).to_string());
136        for j in 0..ncol {
137            col_vals.clear();
138            for &i in rows {
139                col_vals.push(expr.get(i, j));
140            }
141            scores.push(r_mean_narm(&col_vals));
142        }
143    }
144
145    DeconvResult {
146        populations,
147        samples: expr.col_names().to_vec(),
148        scores,
149    }
150}
151
152#[cfg(test)]
153mod tests {
154    use super::*;
155
156    fn approx(a: f64, b: f64) {
157        assert!((a - b).abs() < 1e-12, "{a} vs {b}");
158    }
159
160    #[test]
161    fn hugo_symbols_marker_means() {
162        // CD3D/CD3G/CD28 are "T cells" markers; CD8B is a "CD8 T cells" marker;
163        // GAPDH is not a marker. (See signatures/mcpcounter_genes.txt.)
164        let rows = vec![
165            "CD3D".to_string(),
166            "CD3G".to_string(),
167            "CD28".to_string(),
168            "CD8B".to_string(),
169            "GAPDH".to_string(),
170        ];
171        let cols = vec!["s1".to_string(), "s2".to_string()];
172        #[rustfmt::skip]
173        let data = vec![
174            10.0, 20.0, // CD3D
175            12.0, 18.0, // CD3G
176            14.0, 22.0, // CD28
177            5.0,  7.0,  // CD8B
178            100.0, 200.0, // GAPDH (ignored)
179        ];
180        let m = ExprMatrix::new(rows, cols, data);
181        let res = estimate(&m, FeaturesType::HugoSymbols);
182
183        // genes.txt order => "T cells" before "CD8 T cells".
184        assert_eq!(res.populations, vec!["T cells", "CD8 T cells"]);
185        assert_eq!(res.samples, vec!["s1", "s2"]);
186        // T cells: mean(10,12,14)=12 ; mean(20,18,22)=20
187        approx(res.score(0, 0), 12.0);
188        approx(res.score(0, 1), 20.0);
189        // CD8 T cells: single marker
190        approx(res.score(1, 0), 5.0);
191        approx(res.score(1, 1), 7.0);
192    }
193
194    #[test]
195    fn probesets_default_smoke() {
196        // Two "T cells" probesets from signatures/mcpcounter_probesets.txt.
197        let rows = vec!["204777_s_at".to_string(), "206485_at".to_string()];
198        let cols = vec!["a".to_string()];
199        let data = vec![10.0, 30.0];
200        let m = ExprMatrix::new(rows, cols, data);
201        let res = estimate(&m, FeaturesType::Affy133P2Probesets);
202        assert_eq!(res.populations, vec!["T cells"]);
203        approx(res.score(0, 0), 20.0);
204    }
205
206    #[test]
207    fn absent_markers_drop_population() {
208        // No known markers => no populations.
209        let rows = vec!["FOO".to_string(), "BAR".to_string()];
210        let cols = vec!["s1".to_string()];
211        let data = vec![1.0, 2.0];
212        let m = ExprMatrix::new(rows, cols, data);
213        let res = estimate(&m, FeaturesType::HugoSymbols);
214        assert!(res.populations.is_empty());
215        assert!(res.scores.is_empty());
216    }
217}