use crate::countmatrix::CountMatrix;
use bustools::consistent_genes::{find_consistent, Ec2GeneMapper, Genename, MappingResult, CB, MappingMode, InconsistentResolution};
use bustools::io::{group_record_by_cb_umi, BusFolder, BusReader, BusRecord};
use bustools::iterators::CellGroupIterator;
use bustools::utils::{get_progressbar, int_to_seq};
use sprs;
use std::collections::HashMap;
use std::time::Instant;
type ExpressionVector = HashMap<Genename, u32>;
#[allow(dead_code)]
fn count_bayesian(bfolder: BusFolder) {
let bfile = bfolder.get_busfile();
println!("{}", bfile);
let busiter = BusReader::new(&bfile);
println!("Getting counts");
let counts: Vec<f64> = busiter.map(|record| record.COUNT as f64).collect();
let total_counts: f64 = counts.iter().sum();
println!(
"counts total {}, Entries {}",
total_counts as u64,
counts.len()
);
println!("summing");
}
pub fn count(bfolder: &BusFolder, mapping_mode: MappingMode, ignore_multimapped: bool) -> CountMatrix {
let cb_iter = bfolder.get_iterator().groupby_cb();
println!("determine size of iterator");
let now = Instant::now();
let total_records = bfolder.get_cb_size();
let elapsed_time: std::time::Duration = now.elapsed();
println!(
"determined size of iterator {} in {:?}",
total_records, elapsed_time
);
let (ecmapper, inconstsistent_mode) = match mapping_mode {
MappingMode::EC(_) => panic!("not implemented"),
MappingMode::Gene(ecmapper, inconstsistent_mode) => {(ecmapper, inconstsistent_mode)}
};
let mut all_expression_vector: HashMap<CB, ExpressionVector> = HashMap::new();
let now = Instant::now();
let bar = get_progressbar(total_records as u64);
for (counter, (cb, record_list)) in cb_iter.enumerate() {
let s = records_to_expression_vector(record_list, &ecmapper, ignore_multimapped);
all_expression_vector.insert(CB(cb), s);
if counter % 10_000 == 0 {
bar.inc(10_000)
}
}
let elapsed_time = now.elapsed();
println!("done in {:?}", elapsed_time);
let genelist_vector: Vec<Genename> = ecmapper.get_gene_list();
println!(" genes {}", genelist_vector.len());
let mut genelist_vector2 = genelist_vector.iter().collect::<Vec<&Genename>>();
genelist_vector2.sort();
let countmatrix = expression_vectors_to_matrix(all_expression_vector, genelist_vector2);
println!("{}", countmatrix);
countmatrix
}
fn records_to_expression_vector(
record_list: Vec<BusRecord>,
eg_mapper: &Ec2GeneMapper,
ignore_multimapped: bool,
) -> ExpressionVector {
let mut expression_vector: ExpressionVector = HashMap::new(); let mut _multimapped = 0_u32;
let mut _inconsistant = 0_u32;
let cb_umi_grouped = group_record_by_cb_umi(record_list);
for ((_cb, _umi), records) in cb_umi_grouped {
let m: MappingResult = if ignore_multimapped {
match records.len() {
1 => find_consistent(&records, eg_mapper), 0 => panic!(),
_ => MappingResult::Inconsistent, }
} else {
find_consistent(&records, eg_mapper)
};
match m {
MappingResult::SingleGene(g) => {
let gname = eg_mapper.resolve_gene_id(g);
let val = expression_vector.entry(gname).or_insert(0);
*val += 1;
}
MappingResult::Multimapped(_) => _multimapped += 1,
MappingResult::Inconsistent => _inconsistant += 1,
}
}
expression_vector
}
fn expression_vectors_to_matrix(
all_expression_vector: HashMap<CB, ExpressionVector>,
genelist: Vec<&Genename>,
) -> CountMatrix {
let mut ii: Vec<usize> = Vec::new();
let mut jj: Vec<usize> = Vec::new();
let mut vv: Vec<i32> = Vec::new();
let mut cbs: Vec<CB> = Vec::new();
let mut gene2index: HashMap<&Genename, usize> = HashMap::new();
for (i, g) in genelist.iter().enumerate() {
gene2index.insert(g, i);
}
for (i, (cb, expr_vec)) in all_expression_vector.iter().enumerate() {
for (gene, count) in expr_vec {
ii.push(i);
let gindex = gene2index
.get(gene)
.unwrap_or_else(|| panic!("{:?} not found", gene));
jj.push(*gindex);
vv.push(*count as i32)
}
cbs.push(*cb)
}
let c: sprs::TriMat<i32> = sprs::TriMat::from_triplets((cbs.len(), genelist.len()), ii, jj, vv);
let b: sprs::CsMat<_> = c.to_csr();
let cbs_seq: Vec<String> = cbs.into_iter().map(|x| int_to_seq(x.0, 16)).collect();
let gene_seq: Vec<String> = genelist.into_iter().map(|x| x.0.to_string()).collect();
CountMatrix::new(b, cbs_seq, gene_seq)
}
#[cfg(test)]
mod test {
use super::count;
use crate::{count::records_to_expression_vector, count2::countmap_to_matrix};
use bustools::{
consistent_genes::{Ec2GeneMapper, GeneId, Genename, CB, EC, MappingMode, InconsistentResolution},
io::{setup_busfile, BusFolder, BusRecord},
utils::vec2set,
};
use std::collections::{HashMap, HashSet};
#[test]
fn test_records_to_expression_vector() {
let ec0: HashSet<Genename> =
vec2set(vec![Genename("G1".to_string()), Genename("G2".to_string())]);
let ec1: HashSet<Genename> = vec2set(vec![Genename("G1".to_string())]);
let ec2: HashSet<Genename> = vec2set(vec![Genename("G2".to_string())]);
let ec3: HashSet<Genename> =
vec2set(vec![Genename("G1".to_string()), Genename("G2".to_string())]);
let ec_dict: HashMap<EC, HashSet<Genename>> = HashMap::from([
(EC(0), ec0.clone()),
(EC(1), ec1.clone()),
(EC(2), ec2.clone()),
(EC(3), ec3.clone()),
]);
let es = Ec2GeneMapper::new(ec_dict);
let r1 = BusRecord { CB: 0, UMI: 1, EC: 0, COUNT: 12, FLAG: 0 };
let r2 = BusRecord { CB: 0, UMI: 1, EC: 1, COUNT: 2, FLAG: 0 };
let r4 = BusRecord { CB: 0, UMI: 2, EC: 0, COUNT: 2, FLAG: 0 };
let r5 = BusRecord { CB: 0, UMI: 2, EC: 0, COUNT: 2, FLAG: 0 };
let r6 = BusRecord { CB: 0, UMI: 2, EC: 3, COUNT: 2, FLAG: 0 };
let r7 = BusRecord { CB: 0, UMI: 3, EC: 0, COUNT: 2, FLAG: 0 };
let r8 = BusRecord { CB: 0, UMI: 3, EC: 1, COUNT: 2, FLAG: 0 };
let r9 = BusRecord { CB: 0, UMI: 3, EC: 2, COUNT: 2, FLAG: 0 };
let r10 = BusRecord { CB: 0, UMI: 5, EC: 1, COUNT: 2, FLAG: 0 };
let r11 = BusRecord { CB: 0, UMI: 5, EC: 0, COUNT: 2, FLAG: 0 };
let r12 = BusRecord { CB: 0, UMI: 4, EC: 2, COUNT: 2, FLAG: 0 };
let r13 = BusRecord { CB: 0, UMI: 4, EC: 0, COUNT: 2, FLAG: 0 };
let records0 = vec![r1.clone(), r2.clone()];
let c0 = records_to_expression_vector(records0, &es, false);
assert_eq!(c0, HashMap::from([(Genename("G1".to_string()), 1)]));
let records1 = vec![r1.clone(), r2.clone(), r10.clone(), r11.clone()];
let c1 = records_to_expression_vector(records1, &es, false);
assert_eq!(c1, HashMap::from([(Genename("G1".to_string()), 2)]));
let records2 = vec![r4.clone(), r5.clone(), r6.clone()];
let c2 = records_to_expression_vector(records2, &es, false);
assert_eq!(c2, HashMap::from([]));
let records3 = vec![r1, r2, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13];
let c3 = records_to_expression_vector(records3, &es, false);
assert_eq!(
c3,
HashMap::from([
(Genename("G1".to_string()), 2),
(Genename("G2".to_string()), 1)
])
);
}
#[test]
fn test_count() {
let ec0: HashSet<Genename> =
vec2set(vec![Genename("G1".to_string()), Genename("G2".to_string())]);
let ec1: HashSet<Genename> = vec2set(vec![Genename("G1".to_string())]);
let ec2: HashSet<Genename> = vec2set(vec![Genename("G2".to_string())]);
let ec3: HashSet<Genename> =
vec2set(vec![Genename("G1".to_string()), Genename("G2".to_string())]);
let ec_dict: HashMap<EC, HashSet<Genename>> = HashMap::from([
(EC(0), ec0.clone()),
(EC(1), ec1.clone()),
(EC(2), ec2.clone()),
(EC(3), ec3.clone()),
]);
let es = Ec2GeneMapper::new(ec_dict);
let r1 = BusRecord { CB: 0, UMI: 1, EC: 0, COUNT: 12, FLAG: 0 };
let r2 = BusRecord { CB: 0, UMI: 1, EC: 1, COUNT: 2, FLAG: 0 };
let r3 = BusRecord { CB: 0, UMI: 5, EC: 1, COUNT: 2, FLAG: 0 };
let r4 = BusRecord { CB: 0, UMI: 5, EC: 0, COUNT: 2, FLAG: 0 };
let r5 = BusRecord { CB: 1, UMI: 4, EC: 2, COUNT: 2, FLAG: 0 };
let r6 = BusRecord { CB: 1, UMI: 5, EC: 3, COUNT: 2, FLAG: 0 };
let records = vec![r1, r2, r3, r4, r5, r6];
let (_bname, _dir) = setup_busfile(&records);
let bfolder = BusFolder {
foldername: _dir.path().to_str().unwrap().to_owned(),
};
let mapping_mode = MappingMode::Gene(es, InconsistentResolution::IgnoreInconsistent);
let cmat = count(&bfolder, mapping_mode, false);
let exp: HashMap<_, _> = vec![((CB(0), GeneId(0)), 2), ((CB(1), GeneId(1)), 1)]
.into_iter()
.collect();
let exp_cmat = countmap_to_matrix(
&exp,
vec![Genename("G1".to_string()), Genename("G2".to_string())],
);
assert_eq!(cmat, exp_cmat);
}
}