mcpcounter-rust 0.1.0

//! MCP-counter (Becht et al. 2016): marker-gene abundance estimation.
//!
//! Port of `MCPcounter.estimate` from the GPL-3 MCPcounter R package
//! (<https://github.com/ebecht/MCPcounter>). For each cell population, the score
//! in a sample is the arithmetic mean (R two-pass, `na.rm`) of that population's
//! marker features that are present in the expression matrix. No internal
//! transform or normalization is applied, matching the R implementation.
//!
//! The bundled marker signatures (`signatures/mcpcounter_*.txt`) are the
//! upstream CC0 / public-domain data files.

use std::collections::{HashMap, HashSet};

use crate::matrix::ExprMatrix;
use crate::result::DeconvResult;
use crate::stats::r_mean_narm;

const GENES_TSV: &str = include_str!("signatures/mcpcounter_genes.txt");
const PROBESETS_TSV: &str = include_str!("signatures/mcpcounter_probesets.txt");

/// Feature identifier namespace used to match markers against the expression
/// matrix row names. Mirrors `featuresType` in the R package.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum FeaturesType {
    /// Affymetrix HG 133 Plus 2.0 probesets (the R default).
    Affy133P2Probesets,
    /// Official HUGO gene symbols.
    HugoSymbols,
    /// Entrez gene IDs.
    EntrezId,
    /// Ensembl gene IDs.
    EnsemblId,
}

/// Strip a single pair of surrounding double quotes (R `write.table` style).
fn unquote(s: &str) -> &str {
    let s = s.trim();
    s.strip_prefix('"')
        .and_then(|x| x.strip_suffix('"'))
        .unwrap_or(s)
}

/// `(marker id, population)` pairs for the requested feature namespace, in the
/// row order of the signature file.
fn signature_pairs(ft: FeaturesType) -> Vec<(String, String)> {
    if ft == FeaturesType::Affy133P2Probesets {
        // Headerless TSV: probeset, population.
        return PROBESETS_TSV
            .lines()
            .filter(|l| !l.trim().is_empty())
            .map(|l| {
                let mut it = l.split('\t');
                let id = unquote(it.next().unwrap_or(""));
                let pop = unquote(it.next().unwrap_or(""));
                (id.to_string(), pop.to_string())
            })
            .collect();
    }

    // genes.txt header: "HUGO symbols"\t"Cell population"\t"ENTREZID"\t"ENSEMBL ID"
    let col = match ft {
        FeaturesType::HugoSymbols => 0,
        FeaturesType::EntrezId => 2,
        FeaturesType::EnsemblId => 3,
        FeaturesType::Affy133P2Probesets => unreachable!(),
    };
    GENES_TSV
        .lines()
        .skip(1) // header
        .filter(|l| !l.trim().is_empty())
        .filter_map(|l| {
            let fields: Vec<&str> = l.split('\t').collect();
            let id = unquote(fields.get(col).copied().unwrap_or(""));
            let pop = unquote(fields.get(1).copied().unwrap_or(""));
            // Unquoted `NA` (e.g. missing Ensembl ID) is never a valid feature.
            if id.is_empty() || id == "NA" {
                None
            } else {
                Some((id.to_string(), pop.to_string()))
            }
        })
        .collect()
}

/// Estimate cell-population abundances with MCP-counter.
///
/// Returns populations (rows) by samples (columns). A population is emitted only
/// if at least one of its markers is present in `expr`; populations appear in
/// their first-appearance order in the signature table.
pub fn estimate(expr: &ExprMatrix, features_type: FeaturesType) -> DeconvResult {
    let pairs = signature_pairs(features_type);

    // Population output order = first appearance in the signature table.
    let mut pop_order: Vec<&str> = Vec::new();
    let mut seen: HashSet<&str> = HashSet::new();
    for (_, pop) in &pairs {
        if seen.insert(pop.as_str()) {
            pop_order.push(pop.as_str());
        }
    }

    // Resolve each population's marker rows by direct lookup in the matrix's row
    // index (O(markers)), rather than scanning every row name once per
    // population (the old O(genes x populations) cost). `pairs` is the signature
    // in table order; `row_of` returns the first matrix row for a name.
    let mut pop_rows: HashMap<&str, Vec<usize>> = HashMap::new();
    for (id, pop) in &pairs {
        if let Some(row) = expr.row_of(id) {
            pop_rows.entry(pop.as_str()).or_default().push(row);
        }
    }
    // Sort to ascending row position and de-duplicate to one row per marker: this
    // reproduces R's `expr[intersect(rownames(expr), markers), ]` (row-name
    // order, first row per name). The order matters — the two-pass mean is
    // summation-order sensitive at the last ULP — so it must match R to stay
    // bit-identical.
    for rows in pop_rows.values_mut() {
        rows.sort_unstable();
        rows.dedup();
    }

    let ncol = expr.ncol();
    let mut populations: Vec<String> = Vec::new();
    let mut scores: Vec<f64> = Vec::new();
    let mut col_vals: Vec<f64> = Vec::new();

    for pop in &pop_order {
        let Some(rows) = pop_rows.get(pop) else {
            continue;
        };
        if rows.is_empty() {
            continue;
        }

        populations.push((*pop).to_string());
        for j in 0..ncol {
            col_vals.clear();
            for &i in rows {
                col_vals.push(expr.get(i, j));
            }
            scores.push(r_mean_narm(&col_vals));
        }
    }

    DeconvResult {
        populations,
        samples: expr.col_names().to_vec(),
        scores,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn approx(a: f64, b: f64) {
        assert!((a - b).abs() < 1e-12, "{a} vs {b}");
    }

    #[test]
    fn hugo_symbols_marker_means() {
        // CD3D/CD3G/CD28 are "T cells" markers; CD8B is a "CD8 T cells" marker;
        // GAPDH is not a marker. (See signatures/mcpcounter_genes.txt.)
        let rows = vec![
            "CD3D".to_string(),
            "CD3G".to_string(),
            "CD28".to_string(),
            "CD8B".to_string(),
            "GAPDH".to_string(),
        ];
        let cols = vec!["s1".to_string(), "s2".to_string()];
        #[rustfmt::skip]
        let data = vec![
            10.0, 20.0, // CD3D
            12.0, 18.0, // CD3G
            14.0, 22.0, // CD28
            5.0,  7.0,  // CD8B
            100.0, 200.0, // GAPDH (ignored)
        ];
        let m = ExprMatrix::new(rows, cols, data);
        let res = estimate(&m, FeaturesType::HugoSymbols);

        // genes.txt order => "T cells" before "CD8 T cells".
        assert_eq!(res.populations, vec!["T cells", "CD8 T cells"]);
        assert_eq!(res.samples, vec!["s1", "s2"]);
        // T cells: mean(10,12,14)=12 ; mean(20,18,22)=20
        approx(res.score(0, 0), 12.0);
        approx(res.score(0, 1), 20.0);
        // CD8 T cells: single marker
        approx(res.score(1, 0), 5.0);
        approx(res.score(1, 1), 7.0);
    }

    #[test]
    fn probesets_default_smoke() {
        // Two "T cells" probesets from signatures/mcpcounter_probesets.txt.
        let rows = vec!["204777_s_at".to_string(), "206485_at".to_string()];
        let cols = vec!["a".to_string()];
        let data = vec![10.0, 30.0];
        let m = ExprMatrix::new(rows, cols, data);
        let res = estimate(&m, FeaturesType::Affy133P2Probesets);
        assert_eq!(res.populations, vec!["T cells"]);
        approx(res.score(0, 0), 20.0);
    }

    #[test]
    fn absent_markers_drop_population() {
        // No known markers => no populations.
        let rows = vec!["FOO".to_string(), "BAR".to_string()];
        let cols = vec!["s1".to_string()];
        let data = vec![1.0, 2.0];
        let m = ExprMatrix::new(rows, cols, data);
        let res = estimate(&m, FeaturesType::HugoSymbols);
        assert!(res.populations.is_empty());
        assert!(res.scores.is_empty());
    }
}