use std::collections::{HashMap, HashSet};
use crate::matrix::ExprMatrix;
use crate::result::DeconvResult;
use crate::stats::r_mean_narm;
const GENES_TSV: &str = include_str!("signatures/mcpcounter_genes.txt");
const PROBESETS_TSV: &str = include_str!("signatures/mcpcounter_probesets.txt");
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum FeaturesType {
Affy133P2Probesets,
HugoSymbols,
EntrezId,
EnsemblId,
}
fn unquote(s: &str) -> &str {
let s = s.trim();
s.strip_prefix('"')
.and_then(|x| x.strip_suffix('"'))
.unwrap_or(s)
}
fn signature_pairs(ft: FeaturesType) -> Vec<(String, String)> {
if ft == FeaturesType::Affy133P2Probesets {
return PROBESETS_TSV
.lines()
.filter(|l| !l.trim().is_empty())
.map(|l| {
let mut it = l.split('\t');
let id = unquote(it.next().unwrap_or(""));
let pop = unquote(it.next().unwrap_or(""));
(id.to_string(), pop.to_string())
})
.collect();
}
let col = match ft {
FeaturesType::HugoSymbols => 0,
FeaturesType::EntrezId => 2,
FeaturesType::EnsemblId => 3,
FeaturesType::Affy133P2Probesets => unreachable!(),
};
GENES_TSV
.lines()
.skip(1) .filter(|l| !l.trim().is_empty())
.filter_map(|l| {
let fields: Vec<&str> = l.split('\t').collect();
let id = unquote(fields.get(col).copied().unwrap_or(""));
let pop = unquote(fields.get(1).copied().unwrap_or(""));
if id.is_empty() || id == "NA" {
None
} else {
Some((id.to_string(), pop.to_string()))
}
})
.collect()
}
pub fn estimate(expr: &ExprMatrix, features_type: FeaturesType) -> DeconvResult {
let pairs = signature_pairs(features_type);
let mut pop_order: Vec<&str> = Vec::new();
let mut seen: HashSet<&str> = HashSet::new();
for (_, pop) in &pairs {
if seen.insert(pop.as_str()) {
pop_order.push(pop.as_str());
}
}
let mut pop_rows: HashMap<&str, Vec<usize>> = HashMap::new();
for (id, pop) in &pairs {
if let Some(row) = expr.row_of(id) {
pop_rows.entry(pop.as_str()).or_default().push(row);
}
}
for rows in pop_rows.values_mut() {
rows.sort_unstable();
rows.dedup();
}
let ncol = expr.ncol();
let mut populations: Vec<String> = Vec::new();
let mut scores: Vec<f64> = Vec::new();
let mut col_vals: Vec<f64> = Vec::new();
for pop in &pop_order {
let Some(rows) = pop_rows.get(pop) else {
continue;
};
if rows.is_empty() {
continue;
}
populations.push((*pop).to_string());
for j in 0..ncol {
col_vals.clear();
for &i in rows {
col_vals.push(expr.get(i, j));
}
scores.push(r_mean_narm(&col_vals));
}
}
DeconvResult {
populations,
samples: expr.col_names().to_vec(),
scores,
}
}
#[cfg(test)]
mod tests {
use super::*;
fn approx(a: f64, b: f64) {
assert!((a - b).abs() < 1e-12, "{a} vs {b}");
}
#[test]
fn hugo_symbols_marker_means() {
let rows = vec![
"CD3D".to_string(),
"CD3G".to_string(),
"CD28".to_string(),
"CD8B".to_string(),
"GAPDH".to_string(),
];
let cols = vec!["s1".to_string(), "s2".to_string()];
#[rustfmt::skip]
let data = vec![
10.0, 20.0, 12.0, 18.0, 14.0, 22.0, 5.0, 7.0, 100.0, 200.0, ];
let m = ExprMatrix::new(rows, cols, data);
let res = estimate(&m, FeaturesType::HugoSymbols);
assert_eq!(res.populations, vec!["T cells", "CD8 T cells"]);
assert_eq!(res.samples, vec!["s1", "s2"]);
approx(res.score(0, 0), 12.0);
approx(res.score(0, 1), 20.0);
approx(res.score(1, 0), 5.0);
approx(res.score(1, 1), 7.0);
}
#[test]
fn probesets_default_smoke() {
let rows = vec!["204777_s_at".to_string(), "206485_at".to_string()];
let cols = vec!["a".to_string()];
let data = vec![10.0, 30.0];
let m = ExprMatrix::new(rows, cols, data);
let res = estimate(&m, FeaturesType::Affy133P2Probesets);
assert_eq!(res.populations, vec!["T cells"]);
approx(res.score(0, 0), 20.0);
}
#[test]
fn absent_markers_drop_population() {
let rows = vec!["FOO".to_string(), "BAR".to_string()];
let cols = vec!["s1".to_string()];
let data = vec![1.0, 2.0];
let m = ExprMatrix::new(rows, cols, data);
let res = estimate(&m, FeaturesType::HugoSymbols);
assert!(res.populations.is_empty());
assert!(res.scores.is_empty());
}
}