use std::collections::{HashMap, HashSet};
use crate::ambient_rna_detect::AmbientRnaDetect;
use crate::cell_data::GeneUmiHash;
use core::fmt;
use int_to_str::int_to_str::IntToStr;
#[derive(Clone, Debug, Default)]
pub struct MultiMapper {
pub features: HashSet<u64>,
pub positions: Vec<usize>,
}
#[derive(Clone, Debug, Default)]
pub struct CellData {
pub name: u64,
pub seen: HashSet<GeneUmiHash>,
pub total_reads: HashMap<u64, f32>,
pub multimapper: HashMap<u64, MultiMapper>,
}
impl fmt::Display for CellData {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "CellData Summary")?;
writeln!(f, "----------------")?;
writeln!(f, "Cell Name (ID): {}", self.name)?;
writeln!(f, "Total UMIs: {}", self.total_umis())?;
writeln!(f, "Seen Feature/UMI Pairs: {}", self.seen.len())?;
writeln!(f, "\nPer-Feature Totals:")?;
for (feature_id, count) in &self.total_reads {
writeln!(f, " {} => {}", feature_id, count)?;
}
writeln!(f, "\nSeen Feature/UMI Registry:")?;
for key in &self.seen {
writeln!(f, " {:?}", key)?;
}
writeln!(f, "\nMultimappers ({} entries):", self.multimapper.len())?;
for (seq_id, mm) in &self.multimapper {
writeln!(
f,
" Seq {}: Features {:?}, Positions {:?}",
seq_id, mm.features, mm.positions
)?;
}
Ok(())
}
}
impl CellData {
pub fn new(name: u64) -> Self {
Self {
name,
..Default::default()
}
}
pub fn split_ambient(&self, ambient: &AmbientRnaDetect) -> (Self, Self) {
let mut non_ambient = Self::new(self.name);
let mut ambient_data = Self::new(self.name);
for gh in &self.seen {
if ambient.is_ambient(gh) {
ambient_data.add(*gh);
} else {
non_ambient.add(*gh);
}
}
(non_ambient, ambient_data)
}
pub fn merge(&mut self, other: &CellData) {
for gh in &other.seen {
if !self.seen.insert(*gh) {
#[cfg(debug_assertions)]
eprintln!(
"Warning: duplicate GeneUmiHash {:?} found during merge for cell {}",
gh, self.name
);
continue;
}
*self.total_reads.entry(gh.0).or_insert(0.0) += 1.0;
}
}
pub fn add(&mut self, fh: GeneUmiHash) -> bool {
if !self.seen.insert(fh) {
#[cfg(debug_assertions)]
eprintln!(
"Warning: duplicate GeneUmiHash {:?} detected in cell {} — ignoring.",
fh, self.name
);
return false;
}
*self.total_reads.entry(fh.0).or_insert(0.0) += 1.0;
true
}
pub fn add_value(&mut self, fh: GeneUmiHash, value: f32) -> bool {
if !self.seen.insert(fh) {
#[cfg(debug_assertions)]
eprintln!(
"Warning: duplicate GeneUmiHash {:?} detected in cell {} — ignoring value insert.",
fh, self.name
);
return false;
}
*self.total_reads.entry(fh.0).or_insert(0.0) += value;
true
}
pub fn total_umis(&self) -> usize {
self.seen.len()
}
pub fn total_umis_4_gene_id(&self, feature_id: &u64) -> f32 {
*self.total_reads.get(feature_id).unwrap_or(&0.0)
}
pub fn get_mean_for_gene(&self, feature_id: &u64) -> Option<f32> {
let total = *self.total_reads.get(feature_id)?;
let n = self.seen.iter().filter(|gh| gh.0 == *feature_id).count();
if n > 0 { Some(total / n as f32) } else { None }
}
pub fn to_str_for_feature_ids(&self, feature_ids: &[u64], _length: usize) -> String {
let mut data = Vec::<String>::with_capacity(feature_ids.len() + 4);
data.push(IntToStr::u8_array_to_str(&self.name.to_le_bytes()).to_string());
let mut total = 0.0f32;
let mut max = 0.0f32;
let mut max_id: Option<u64> = None;
for id in feature_ids {
let n = *self.total_reads.get(id).unwrap_or(&0.0);
if n > max {
max = n;
max_id = Some(*id);
}
data.push(n.to_string());
total += n;
}
let mut dist2max = f32::MAX;
for id in feature_ids {
let n = *self.total_reads.get(id).unwrap_or(&0.0);
let diff = max - n;
if diff > 0.0 && diff < dist2max {
dist2max = diff;
}
}
data.push(
max_id
.map(|id| id.to_string())
.unwrap_or_else(|| "na".to_string()),
);
data.push(if total > 0.0 {
(max / total).to_string()
} else {
"0".to_string()
});
data.push(total.to_string());
data.push(if max > 0.0 && dist2max.is_finite() {
(dist2max / max).to_string()
} else {
"0".to_string()
});
data.join("\t")
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::cell_data::GeneUmiHash;
#[test]
fn test_new_cell() {
let cell = CellData::new(42);
assert_eq!(cell.name, 42);
assert_eq!(cell.total_umis(), 0);
assert!(cell.seen.is_empty());
assert!(cell.total_reads.is_empty());
}
#[test]
fn test_add_unique_feature_umi() {
let mut cell = CellData::new(1);
let fh = GeneUmiHash(10, 123);
let inserted = cell.add(fh);
assert!(inserted);
assert_eq!(cell.total_umis(), 1);
assert!(cell.seen.contains(&fh));
assert_eq!(cell.total_reads.get(&10), Some(&1.0));
}
#[test]
fn test_add_duplicate_rejected() {
let mut cell = CellData::new(1);
let fh = GeneUmiHash(10, 123);
assert!(cell.add(fh));
assert!(!cell.add(fh));
assert_eq!(cell.total_umis(), 1);
assert_eq!(cell.total_reads.get(&10), Some(&1.0));
}
#[test]
fn test_add_value_new_only() {
let mut cell = CellData::new(1);
let fh = GeneUmiHash(5, 111);
assert!(cell.add_value(fh, 2.0));
assert!(!cell.add_value(fh, 3.0));
assert!(cell.seen.contains(&fh));
assert_eq!(cell.total_reads.get(&5), Some(&2.0));
assert_eq!(cell.total_umis(), 1);
}
#[test]
fn test_total_umis() {
let mut cell = CellData::new(1);
cell.add(GeneUmiHash(1, 10));
cell.add(GeneUmiHash(1, 11));
cell.add(GeneUmiHash(2, 12));
assert_eq!(cell.total_umis(), 3);
}
#[test]
fn test_merge_cells() {
let mut a = CellData::new(1);
let mut b = CellData::new(1);
let fh1 = GeneUmiHash(1, 10);
let fh2 = GeneUmiHash(2, 20);
a.add(fh1);
b.add(fh2);
a.merge(&b);
assert_eq!(a.total_umis(), 2);
assert!(a.seen.contains(&fh1));
assert!(a.seen.contains(&fh2));
assert_eq!(a.total_reads.get(&1), Some(&1.0));
assert_eq!(a.total_reads.get(&2), Some(&1.0));
}
#[test]
fn test_total_umis_for_feature() {
let mut cell = CellData::new(1);
cell.add(GeneUmiHash(5, 1));
cell.add(GeneUmiHash(5, 2));
cell.add(GeneUmiHash(3, 1));
assert_eq!(cell.total_umis_4_gene_id(&5), 2.0);
assert_eq!(cell.total_umis_4_gene_id(&3), 1.0);
assert_eq!(cell.total_umis_4_gene_id(&9), 0.0);
}
#[test]
fn test_mean_for_feature() {
let mut cell = CellData::new(1);
cell.add_value(GeneUmiHash(1, 1), 2.0);
cell.add_value(GeneUmiHash(1, 2), 4.0);
let mean = cell.get_mean_for_gene(&1).unwrap();
assert!((mean - 3.0).abs() < 1e-6);
}
}