use std::collections::HashMap;
use crate::matrix::ExprMatrix;
use crate::result::DeconvResult;
use crate::stats::r_median_narm;
const SIG_GCRM38: &str = include_str!("signatures/mmcpcounter_signatures_GCRm38.txt");
const SIG_GCRM39: &str = include_str!("signatures/mmcpcounter_signatures_GCRm39.txt");
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum GenomeVersion {
GCRm38,
GCRm39,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum MurineFeaturesType {
GeneSymbol,
EnsemblId,
Probes,
}
const POPULATION_ORDER: [&str; 16] = [
"T cells",
"CD8 T cells",
"NK cells",
"B derived",
"Memory B cells",
"Monocytes / macrophages",
"Monocytes",
"Granulocytes",
"Mast cells",
"Eosinophils",
"Neutrophils",
"Basophils",
"Vessels",
"Lymphatics",
"Endothelial cells",
"Fibroblasts",
];
fn unquote(s: &str) -> &str {
let s = s.trim();
s.strip_prefix('"')
.and_then(|x| x.strip_suffix('"'))
.unwrap_or(s)
}
fn signature_pairs(genome: GenomeVersion, features: MurineFeaturesType) -> Vec<(String, String)> {
let tsv = match genome {
GenomeVersion::GCRm38 => SIG_GCRM38,
GenomeVersion::GCRm39 => SIG_GCRM39,
};
let feat_col = match features {
MurineFeaturesType::Probes => 0,
MurineFeaturesType::GeneSymbol => 2,
MurineFeaturesType::EnsemblId => 3,
};
const DENOM_COL: usize = 1;
tsv.lines()
.skip(1) .filter(|l| !l.trim().is_empty())
.filter_map(|l| {
let fields: Vec<&str> = l.split('\t').collect();
let id = unquote(fields.get(feat_col).copied().unwrap_or(""));
let pop = unquote(fields.get(DENOM_COL).copied().unwrap_or(""));
if id.is_empty() || id == "NA" {
None
} else {
Some((id.to_string(), pop.to_string()))
}
})
.collect()
}
pub fn estimate(
expr: &ExprMatrix,
features: MurineFeaturesType,
genome: GenomeVersion,
) -> DeconvResult {
let pairs = signature_pairs(genome, features);
let mut groups: HashMap<&str, Vec<usize>> = HashMap::new();
for (id, pop) in &pairs {
if let Some(row) = expr.row_of(id) {
groups.entry(pop.as_str()).or_default().push(row);
}
}
let ncol = expr.ncol();
let mut populations: Vec<String> = Vec::new();
let mut scores: Vec<f64> = Vec::new();
let mut col_vals: Vec<f64> = Vec::new();
for &pop in POPULATION_ORDER.iter() {
let Some(rows) = groups.get(pop) else {
continue; };
if rows.is_empty() {
continue;
}
let start = scores.len();
let mut any_finite = false;
for j in 0..ncol {
col_vals.clear();
for &i in rows {
col_vals.push(expr.get(i, j));
}
let m = r_median_narm(&col_vals);
if !m.is_nan() {
any_finite = true;
}
scores.push(m);
}
if any_finite {
populations.push(pop.to_string());
} else {
scores.truncate(start); }
}
DeconvResult {
populations,
samples: expr.col_names().to_vec(),
scores,
}
}
#[cfg(test)]
mod tests {
use super::*;
fn approx(a: f64, b: f64) {
assert!((a - b).abs() < 1e-12, "{a} vs {b}");
}
#[test]
fn gene_symbol_median_basic() {
let rows = vec![
"Cd6".to_string(),
"Cd3d".to_string(),
"Cd3e".to_string(),
"Cd8b1".to_string(),
"Bg".to_string(),
];
let cols = vec!["s1".to_string(), "s2".to_string()];
#[rustfmt::skip]
let data = vec![
1.0, 10.0, 2.0, 40.0, 3.0, 20.0, 7.0, 70.0, 99.0, 99.0, ];
let m = ExprMatrix::new(rows, cols, data);
let res = estimate(&m, MurineFeaturesType::GeneSymbol, GenomeVersion::GCRm39);
assert_eq!(res.populations, vec!["T cells", "CD8 T cells"]);
assert_eq!(res.samples, vec!["s1", "s2"]);
approx(res.score(0, 0), 2.0);
approx(res.score(0, 1), 20.0);
approx(res.score(1, 0), 7.0);
approx(res.score(1, 1), 70.0);
}
#[test]
fn marker_multiplicity_is_preserved() {
let rows = vec![
"Kcne4".to_string(), "Agtr1a".to_string(), "Ccl19".to_string(), ];
let cols = vec!["s1".to_string()];
let data = vec![1.0, 2.0, 100.0];
let m = ExprMatrix::new(rows, cols, data);
let res = estimate(&m, MurineFeaturesType::GeneSymbol, GenomeVersion::GCRm39);
let idx = res
.populations
.iter()
.position(|p| p == "Fibroblasts")
.expect("Fibroblasts present");
approx(res.score(idx, 0), 100.0);
}
#[test]
fn absent_markers_yield_empty() {
let rows = vec!["nope1".to_string(), "nope2".to_string()];
let cols = vec!["s1".to_string()];
let data = vec![1.0, 2.0];
let m = ExprMatrix::new(rows, cols, data);
let res = estimate(&m, MurineFeaturesType::GeneSymbol, GenomeVersion::GCRm39);
assert!(res.populations.is_empty());
assert!(res.scores.is_empty());
}
#[test]
fn probes_feature_and_gcrm38_table() {
let rows = vec!["10584821".to_string(), "10593024".to_string()];
let cols = vec!["s1".to_string()];
let data = vec![4.0, 8.0];
let m = ExprMatrix::new(rows, cols, data);
let res = estimate(&m, MurineFeaturesType::Probes, GenomeVersion::GCRm38);
let idx = res
.populations
.iter()
.position(|p| p == "T cells")
.expect("T cells present");
approx(res.score(idx, 0), 6.0); }
}