mcpcounter-rust 0.1.0

Pure-Rust port of the MCPcounter cell-population quantification methods: MCP-counter (human) and mMCP-counter (mouse), validated for numeric parity against the original R implementations.
Documentation
//! mMCP-counter (Petitprez et al. 2020): murine marker-gene abundance.
//!
//! Port of `mMCPcounter.estimate` from the GPL-3 mMCP-counter R package
//! (<https://github.com/cit-bioinfo/mMCP-counter>). For each cell population the
//! score in a sample is the **median** (R `median`, `na.rm = TRUE`) over that
//! population's marker features present in the expression matrix. No internal
//! transform or normalization is applied, matching the R implementation.
//!
//! Two differences from MCP-counter are reproduced faithfully:
//!
//! * **Aggregation is the median, not the mean.** R averages the two central
//!   order statistics with a two-pass `mean()` for even counts (see
//!   [`crate::stats::r_median_narm`]).
//! * **Marker multiplicity is preserved.** R selects expression rows with
//!   `exp[localSig[, features], ]`, so a feature listed *k* times in the
//!   signature contributes *k* copies to the median. (MCP-counter instead
//!   de-duplicates via `intersect`.) For example `Ccl19` is listed three times
//!   under "Fibroblasts" and counts three times here.
//!
//! The bundled signatures (`signatures/mmcpcounter_signatures_GCRm3{8,9}.txt`)
//! are exported verbatim from the package's binary `.RData` tables and inherit
//! its GPL-3 license.

use std::collections::HashMap;

use crate::matrix::ExprMatrix;
use crate::result::DeconvResult;
use crate::stats::r_median_narm;

const SIG_GCRM38: &str = include_str!("signatures/mmcpcounter_signatures_GCRm38.txt");
const SIG_GCRM39: &str = include_str!("signatures/mmcpcounter_signatures_GCRm39.txt");

/// Murine genome assembly selecting which bundled signature table to use.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum GenomeVersion {
    /// GRCm38 / mm10.
    GCRm38,
    /// GRCm39 / mm39 (the R default).
    GCRm39,
}

/// Feature identifier namespace used to match markers against expression rows.
/// Mirrors the `features` argument of `mMCPcounter.estimate`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum MurineFeaturesType {
    /// Murine gene symbols (the R default).
    GeneSymbol,
    /// Ensembl gene IDs.
    EnsemblId,
    /// Microarray probe IDs.
    Probes,
}

/// Fixed mMCP-counter output population order. R reorders the aggregated table
/// to exactly this list, then drops populations for which no markers were
/// found (their rows are entirely `NA`).
const POPULATION_ORDER: [&str; 16] = [
    "T cells",
    "CD8 T cells",
    "NK cells",
    "B derived",
    "Memory B cells",
    "Monocytes / macrophages",
    "Monocytes",
    "Granulocytes",
    "Mast cells",
    "Eosinophils",
    "Neutrophils",
    "Basophils",
    "Vessels",
    "Lymphatics",
    "Endothelial cells",
    "Fibroblasts",
];

/// Strip a single pair of surrounding double quotes (R `write.table` style).
fn unquote(s: &str) -> &str {
    let s = s.trim();
    s.strip_prefix('"')
        .and_then(|x| x.strip_suffix('"'))
        .unwrap_or(s)
}

/// `(marker id, population)` pairs for the requested genome + feature namespace,
/// in signature-table row order, preserving duplicates.
fn signature_pairs(genome: GenomeVersion, features: MurineFeaturesType) -> Vec<(String, String)> {
    let tsv = match genome {
        GenomeVersion::GCRm38 => SIG_GCRM38,
        GenomeVersion::GCRm39 => SIG_GCRM39,
    };
    // Header columns: "Probes" | "Denomination" | "Gene.Symbol" | "ENSEMBL.ID".
    let feat_col = match features {
        MurineFeaturesType::Probes => 0,
        MurineFeaturesType::GeneSymbol => 2,
        MurineFeaturesType::EnsemblId => 3,
    };
    const DENOM_COL: usize = 1;

    tsv.lines()
        .skip(1) // header
        .filter(|l| !l.trim().is_empty())
        .filter_map(|l| {
            let fields: Vec<&str> = l.split('\t').collect();
            let id = unquote(fields.get(feat_col).copied().unwrap_or(""));
            let pop = unquote(fields.get(DENOM_COL).copied().unwrap_or(""));
            if id.is_empty() || id == "NA" {
                None
            } else {
                Some((id.to_string(), pop.to_string()))
            }
        })
        .collect()
}

/// Estimate murine cell-population abundances with mMCP-counter.
///
/// Returns populations (rows) by samples (columns). Populations are emitted in
/// the fixed [`POPULATION_ORDER`]; a population is dropped if none of its
/// markers are present in `expr`, or if every resulting score is `NaN` (R's
/// all-`NA` row filter).
///
/// Divergence from R: `mMCPcounter.estimate` raises an error when *no* signature
/// feature matches the input row names. To stay consistent with the rest of this
/// crate (and to behave as a library rather than a script), this instead returns
/// an empty result.
pub fn estimate(
    expr: &ExprMatrix,
    features: MurineFeaturesType,
    genome: GenomeVersion,
) -> DeconvResult {
    let pairs = signature_pairs(genome, features);

    // Group the expression-row indices for each population, preserving marker
    // multiplicity: this mirrors `exp[localSig[, features], ]` followed by
    // grouping on `localSig$Denomination`. A feature listed k times contributes
    // its (first) matching expression row k times. R's by-name indexing takes
    // the first row for duplicate names, which `row_of` already does.
    let mut groups: HashMap<&str, Vec<usize>> = HashMap::new();
    for (id, pop) in &pairs {
        if let Some(row) = expr.row_of(id) {
            // Find the population's canonical &str key from POPULATION_ORDER so
            // both keys and lookups share the same lifetime/spelling.
            groups.entry(pop.as_str()).or_default().push(row);
        }
    }

    let ncol = expr.ncol();
    let mut populations: Vec<String> = Vec::new();
    let mut scores: Vec<f64> = Vec::new();
    let mut col_vals: Vec<f64> = Vec::new();

    for &pop in POPULATION_ORDER.iter() {
        let Some(rows) = groups.get(pop) else {
            continue; // absent population -> all-NA row -> dropped by R's filter
        };
        if rows.is_empty() {
            continue;
        }

        // Compute the per-sample median first, so the all-NA filter can be
        // applied exactly as R does (`sum(is.na(x)) < ncol`).
        let start = scores.len();
        let mut any_finite = false;
        for j in 0..ncol {
            col_vals.clear();
            for &i in rows {
                col_vals.push(expr.get(i, j));
            }
            let m = r_median_narm(&col_vals);
            if !m.is_nan() {
                any_finite = true;
            }
            scores.push(m);
        }

        if any_finite {
            populations.push(pop.to_string());
        } else {
            scores.truncate(start); // entirely-NA population: drop it
        }
    }

    DeconvResult {
        populations,
        samples: expr.col_names().to_vec(),
        scores,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn approx(a: f64, b: f64) {
        assert!((a - b).abs() < 1e-12, "{a} vs {b}");
    }

    #[test]
    fn gene_symbol_median_basic() {
        // "T cells" markers (GCRm39): Cd6, Cd3d, Cd3e. "CD8 T cells": Cd8b1.
        // Bg is a non-marker and must be ignored.
        let rows = vec![
            "Cd6".to_string(),
            "Cd3d".to_string(),
            "Cd3e".to_string(),
            "Cd8b1".to_string(),
            "Bg".to_string(),
        ];
        let cols = vec!["s1".to_string(), "s2".to_string()];
        #[rustfmt::skip]
        let data = vec![
            1.0, 10.0,  // Cd6
            2.0, 40.0,  // Cd3d
            3.0, 20.0,  // Cd3e
            7.0, 70.0,  // Cd8b1
            99.0, 99.0, // Bg (ignored)
        ];
        let m = ExprMatrix::new(rows, cols, data);
        let res = estimate(&m, MurineFeaturesType::GeneSymbol, GenomeVersion::GCRm39);

        // Output order is the fixed list: "T cells" then "CD8 T cells".
        assert_eq!(res.populations, vec!["T cells", "CD8 T cells"]);
        assert_eq!(res.samples, vec!["s1", "s2"]);
        // T cells s1: median(1,2,3)=2 ; s2: median(10,40,20)=20
        approx(res.score(0, 0), 2.0);
        approx(res.score(0, 1), 20.0);
        // CD8 T cells: single marker
        approx(res.score(1, 0), 7.0);
        approx(res.score(1, 1), 70.0);
    }

    #[test]
    fn marker_multiplicity_is_preserved() {
        // "Fibroblasts" lists Ccl19 three times; with two other distinct
        // markers present the median is over a 5-element multiset
        // {Kcne4, Ccl19, Ccl19, Ccl19, Agtr1a}. Sorted by value the median is
        // the Ccl19 value, demonstrating multiplicity matters.
        let rows = vec![
            "Kcne4".to_string(),  // 1.0
            "Agtr1a".to_string(), // 2.0
            "Ccl19".to_string(),  // 100.0 (x3 via signature multiplicity)
        ];
        let cols = vec!["s1".to_string()];
        let data = vec![1.0, 2.0, 100.0];
        let m = ExprMatrix::new(rows, cols, data);
        let res = estimate(&m, MurineFeaturesType::GeneSymbol, GenomeVersion::GCRm39);

        let idx = res
            .populations
            .iter()
            .position(|p| p == "Fibroblasts")
            .expect("Fibroblasts present");
        // multiset = [1, 2, 100, 100, 100] -> median = 100 (not (1+2)/.. etc).
        approx(res.score(idx, 0), 100.0);
    }

    #[test]
    fn absent_markers_yield_empty() {
        let rows = vec!["nope1".to_string(), "nope2".to_string()];
        let cols = vec!["s1".to_string()];
        let data = vec![1.0, 2.0];
        let m = ExprMatrix::new(rows, cols, data);
        let res = estimate(&m, MurineFeaturesType::GeneSymbol, GenomeVersion::GCRm39);
        assert!(res.populations.is_empty());
        assert!(res.scores.is_empty());
    }

    #[test]
    fn probes_feature_and_gcrm38_table() {
        // Probe 10584821 -> Cd3d -> "T cells" in both tables; probes are unique.
        let rows = vec!["10584821".to_string(), "10593024".to_string()];
        let cols = vec!["s1".to_string()];
        let data = vec![4.0, 8.0];
        let m = ExprMatrix::new(rows, cols, data);
        let res = estimate(&m, MurineFeaturesType::Probes, GenomeVersion::GCRm38);
        let idx = res
            .populations
            .iter()
            .position(|p| p == "T cells")
            .expect("T cells present");
        approx(res.score(idx, 0), 6.0); // median(4,8) = 6
    }
}