use crate::consistent_transcripts::{Ec2TranscriptMapper, TranscriptId, Transcriptname};
use crate::io::BusFolder;
use crate::{disjoint::Intersector, io::BusRecord};
use itertools::izip;
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::hash::Hash;
use std::io::{BufReader, BufRead};
#[derive(Eq, PartialEq, Hash, Ord, PartialOrd, Copy, Clone, Debug)]
pub struct EC(pub u32);
#[derive(Eq, PartialEq, Hash, Ord, PartialOrd, Copy, Clone, Debug)]
pub struct GeneId(pub u32);
#[derive(Eq, PartialEq, Hash, Ord, PartialOrd, Copy, Clone, Debug)]
pub struct CB(pub u64);
#[derive(Eq, PartialEq, Hash, Ord, PartialOrd, Clone, Debug)]
pub struct Genename(pub String);
pub (crate) fn update_intersection_via_retain<T: Hash + Eq>(inter: &mut HashSet<T>, newset: &HashSet<T>) {
inter.retain(|item| newset.contains(item));
}
#[derive(Debug, PartialEq)]
pub enum MappingResult {
SingleGene(GeneId),
Multimapped(HashSet<GeneId>),
Inconsistent,
}
pub enum MappingMode {
EC(InconsistentResolution), Gene(Ec2GeneMapper, InconsistentResolution), Transcript(Ec2TranscriptMapper, InconsistentResolution), }
#[derive(Debug)]
pub enum InconsistentResolution {
IgnoreInconsistent, AsDistinct, AsSingle, }
pub fn find_consistent(records: &[BusRecord], ec2gene: &Ec2GeneMapper) -> MappingResult {
let mut setlist = records.iter().map(|r| ec2gene.get_genes(EC(r.EC)));
let s1 = setlist.next().unwrap();
let mut shared_genes = s1.clone();
if records.len() == 1 {
if shared_genes.len() == 1 {
let elem = *shared_genes.iter().next().unwrap();
return MappingResult::SingleGene(elem);
} else {
return MappingResult::Multimapped(shared_genes);
}
}
for current_set in setlist {
update_intersection_via_retain(&mut shared_genes, current_set);
if shared_genes.is_empty() {
break;
}
}
match shared_genes.len() {
0 => MappingResult::Inconsistent,
1 => {
let elem = *shared_genes.iter().next().unwrap();
MappingResult::SingleGene(elem)
}
_ => MappingResult::Multimapped(shared_genes),
}
}
#[derive(Debug, Clone)]
pub struct Ec2GeneMapper {
ec2geneid: HashMap<EC, HashSet<GeneId>>,
int_to_gene: HashMap<GeneId, Genename>,
}
impl Ec2GeneMapper {
pub fn new(ec2gene: HashMap<EC, HashSet<Genename>>) -> Self {
let mut gene_list: HashSet<Genename> = HashSet::new();
for genes in ec2gene.values() {
for g in genes {
gene_list.insert(g.clone());
}
}
let mut gene_vector: Vec<Genename> = gene_list.into_iter().collect();
gene_vector.sort();
let gene_to_int: HashMap<Genename, GeneId> = gene_vector
.into_iter()
.enumerate()
.map(|(i, g)| (g, GeneId(i as u32)))
.collect();
let int_to_gene: HashMap<GeneId, Genename> =
gene_to_int.iter().map(|(g, i)| (*i, g.clone())).collect();
let mut ec2geneid: HashMap<EC, HashSet<GeneId>> = HashMap::new();
for (ec, genes) in ec2gene.iter() {
let geneids: HashSet<GeneId> = genes
.iter()
.map(|gname| *gene_to_int.get(gname).unwrap())
.collect();
ec2geneid.insert(*ec, geneids);
}
Ec2GeneMapper { ec2geneid, int_to_gene }
}
pub fn get_genes(&self, ec: EC) -> &HashSet<GeneId> {
self.ec2geneid.get(&ec).unwrap()
}
pub fn get_genenames(&self, ec: EC) -> HashSet<Genename> {
let geneids = self.get_genes(ec);
let genenames = geneids
.iter()
.map(|gid| self.resolve_gene_id(*gid))
.collect();
genenames
}
pub fn resolve_gene_id(&self, gene_id: GeneId) -> Genename {
let r = self.int_to_gene.get(&gene_id).unwrap();
r.clone()
}
pub fn get_gene_list(&self) -> Vec<Genename> {
let ngenes = self.int_to_gene.len();
let genelist_vector: Vec<Genename> = (0..ngenes)
.map(|k| self.resolve_gene_id(GeneId(k as u32)))
.collect();
genelist_vector
}
}
pub (crate) fn make_mapper(busfolder: &BusFolder, t2g_file: &str) -> Ec2GeneMapper{
let t2g_dict = parse_t2g(t2g_file);
let e2g = build_ec2gene(
&busfolder.parse_ecmatrix(),
&busfolder.parse_transcript(),
&t2g_dict);
Ec2GeneMapper::new(e2g)
}
fn build_ec2gene(
ec_dict: &HashMap<EC, Vec<TranscriptId>>,
transcript_dict: &HashMap<TranscriptId, Transcriptname>,
t2g_dict: &HashMap<Transcriptname, Genename>,
) -> HashMap<EC, HashSet<Genename>> {
let mut ec2gene: HashMap<EC, HashSet<Genename>> = HashMap::new();
for (ec, transcript_ints) in ec_dict.iter() {
let mut genes: HashSet<Genename> = HashSet::new();
for t_int in transcript_ints {
let t_name = transcript_dict.get(t_int).unwrap();
if let Some(genename) = t2g_dict.get(t_name) {
genes.insert(genename.clone());
}
}
ec2gene.insert(*ec, genes);
}
ec2gene
}
fn parse_t2g(t2g_file: &str) -> HashMap<Transcriptname, Genename> {
let mut t2g_dict: HashMap<Transcriptname, Genename> = HashMap::new();
let file = File::open(t2g_file).unwrap_or_else(|_| panic!("{} not found", t2g_file));
let reader = BufReader::new(file);
for line in reader.lines() {
if let Ok(l) = line {
let mut s = l.split_whitespace();
let transcript_id = s.next().unwrap();
let ensemble_id = s.next().unwrap();
let _symbol = s.next().unwrap();
let tname = Transcriptname(transcript_id.to_string());
assert!(!t2g_dict.contains_key(&tname)); t2g_dict.insert(tname, Genename(ensemble_id.to_string()));
} else {
panic!("Error readin lines from {}", t2g_file);
}
}
t2g_dict
}
#[derive(Debug)]
#[allow(non_snake_case)]
pub struct CUGset {
pub CB: u64, pub UMI: u64, pub GENESET: HashSet<Genename>, pub COUNT: u32, }
pub fn groubygene(records: Vec<BusRecord>, ec2gene: &Ec2GeneMapper) -> Vec<CUGset> {
let mut emissions: Vec<CUGset> = Vec::with_capacity(records.len());
let mut inter: Intersector<Genename, BusRecord> = Intersector::new();
for r in records {
let gset = ec2gene.get_genenames(EC(r.EC));
inter.add(gset, r);
}
for (gset, grouped_records) in izip!(inter.keys, inter.items) {
let counts: u32 = grouped_records.iter().map(|x| x.COUNT).sum();
let r1 = grouped_records.first().unwrap();
let new_record = CUGset { CB: r1.CB, UMI: r1.UMI, GENESET: gset, COUNT: counts };
emissions.push(new_record);
}
emissions
}
#[cfg(test)]
mod testing {
use crate::{
consistent_genes::{find_consistent, groubygene, Genename, MappingResult},
io::BusRecord,
utils::vec2set,
};
use std::collections::{HashMap, HashSet};
use super::{Ec2GeneMapper, EC};
#[test]
fn test_consistent() {
let ec0 = vec2set(vec![Genename("A".to_string())]);
let ec1 = vec2set(vec![Genename("B".to_string())]);
let ec2 = vec2set(vec![Genename("A".to_string()), Genename("B".to_string())]);
let ec3 = vec2set(vec![Genename("C".to_string()), Genename("D".to_string())]);
let ec_dict: HashMap<EC, HashSet<Genename>> =
HashMap::from([(EC(0), ec0), (EC(1), ec1), (EC(2), ec2), (EC(3), ec3)]);
let es_mapper = Ec2GeneMapper::new(ec_dict);
let r1 = BusRecord { CB: 0, UMI: 21, EC: 0, COUNT: 2, FLAG: 0 };
let res1: MappingResult = find_consistent(&vec![r1], &es_mapper);
match res1 {
MappingResult::SingleGene(g) => {
assert_eq!(Genename("A".to_string()), es_mapper.resolve_gene_id(g))
}
MappingResult::Multimapped(_) | MappingResult::Inconsistent => panic!(),
}
let r2 = BusRecord { CB: 0, UMI: 21, EC: 2, COUNT: 2, FLAG: 0 };
let res2: MappingResult = find_consistent(&vec![r2], &es_mapper);
println!("{:?}", res2);
match res2 {
MappingResult::SingleGene(_) | MappingResult::Inconsistent => panic!(),
MappingResult::Multimapped(gs) => {
let obs = gs
.iter()
.map(|g| es_mapper.resolve_gene_id(*g))
.collect::<HashSet<_>>();
assert_eq!(
obs,
vec2set(vec![Genename("A".to_string()), Genename("B".to_string())])
)
}
}
let r3 = BusRecord { CB: 1, UMI: 3, EC: 0, COUNT: 2, FLAG: 0 };
let r4 = BusRecord { CB: 3, UMI: 0, EC: 2, COUNT: 2, FLAG: 0 };
let res3: MappingResult = find_consistent(&vec![r3, r4], &es_mapper);
match res3 {
MappingResult::SingleGene(g) => {
assert_eq!(Genename("A".to_string()), es_mapper.resolve_gene_id(g))
}
MappingResult::Multimapped(_) | MappingResult::Inconsistent => panic!(),
}
let r5 = BusRecord { CB: 1, UMI: 3, EC: 0, COUNT: 2, FLAG: 0 };
let r6 = BusRecord { CB: 3, UMI: 0, EC: 1, COUNT: 2, FLAG: 0 };
let res4 = find_consistent(&vec![r5, r6], &es_mapper);
assert_eq!(res4, MappingResult::Inconsistent);
let r7 = BusRecord { CB: 1, UMI: 3, EC: 0, COUNT: 2, FLAG: 0 };
let r8 = BusRecord { CB: 3, UMI: 0, EC: 1, COUNT: 2, FLAG: 0 };
let r9 = BusRecord { CB: 3, UMI: 0, EC: 2, COUNT: 2, FLAG: 0 };
let res5 = find_consistent(&vec![r7, r8, r9], &es_mapper);
assert_eq!(res5, MappingResult::Inconsistent)
}
#[test]
fn test_ec_struct() {
let ec0 = vec2set(vec![Genename("A".to_string())]);
let ec1 = vec2set(vec![Genename("B".to_string())]);
let ec2 = vec2set(vec![Genename("A".to_string()), Genename("B".to_string())]);
let ec3 = vec2set(vec![Genename("C".to_string()), Genename("D".to_string())]);
let ec_dict: HashMap<EC, HashSet<Genename>> = HashMap::from([
(EC(0), ec0.clone()),
(EC(1), ec1.clone()),
(EC(2), ec2.clone()),
(EC(3), ec3.clone()),
]);
let es = Ec2GeneMapper::new(ec_dict);
for (i, ec_set) in vec![ec0, ec1, ec2, ec3].iter().enumerate() {
let gids = es.get_genes(EC(i as u32));
let gnames: HashSet<_> = gids.iter().map(|i| es.resolve_gene_id(*i)).collect();
assert_eq!(&gnames, ec_set);
}
}
#[test]
fn test_get_gene_list() {
let ec0 = vec2set(vec![Genename("A".to_string())]);
let ec1 = vec2set(vec![Genename("B".to_string())]);
let ec2 = vec2set(vec![Genename("A".to_string()), Genename("B".to_string())]);
let ec3 = vec2set(vec![Genename("C".to_string()), Genename("D".to_string())]);
let ec_dict: HashMap<EC, HashSet<Genename>> = HashMap::from([
(EC(0), ec0.clone()),
(EC(1), ec1.clone()),
(EC(2), ec2.clone()),
(EC(3), ec3.clone()),
]);
let es = Ec2GeneMapper::new(ec_dict);
assert_eq!(
es.get_gene_list(),
vec![
Genename("A".to_string()),
Genename("B".to_string()),
Genename("C".to_string()),
Genename("D".to_string())
]
)
}
#[test]
fn test_groubygene() {
let ec0: HashSet<Genename> = vec2set(vec![Genename("A".to_string())]);
let ec1: HashSet<Genename> = vec2set(vec![Genename("B".to_string())]);
let ec2: HashSet<Genename> =
vec2set(vec![Genename("A".to_string()), Genename("B".to_string())]);
let ec3: HashSet<Genename> =
vec2set(vec![Genename("C".to_string()), Genename("D".to_string())]);
let ec_dict: HashMap<EC, HashSet<Genename>> = HashMap::from([
(EC(0), ec0.clone()),
(EC(1), ec1.clone()),
(EC(2), ec2.clone()),
(EC(3), ec3.clone()),
]);
let es = Ec2GeneMapper::new(ec_dict);
let r1 = BusRecord { CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0 };
let r2 = BusRecord { CB: 0, UMI: 1, EC: 2, COUNT: 2, FLAG: 0 };
let r = groubygene(vec![r1, r2], &es);
assert_eq!(r.len(), 1);
let r1 = r.first().unwrap();
assert_eq!(r1.COUNT, 4);
assert_eq!(
r1.GENESET,
vec![Genename("A".to_string())]
.into_iter()
.collect::<HashSet<Genename>>()
);
let s1 = BusRecord { CB: 0, UMI: 1, EC: 0, COUNT: 3, FLAG: 0 }; let s2 = BusRecord { CB: 0, UMI: 1, EC: 1, COUNT: 4, FLAG: 0 }; let r = groubygene(vec![s1, s2], &es);
assert_eq!(r.len(), 2);
let r1 = &r[0];
let r2 = &r[1];
assert_eq!(r1.COUNT, 3);
assert_eq!(r2.COUNT, 4);
}
}