use crate::disjoint::DisjointSubsets;
use crate::utils::ec_mapper_dict_from_busfolders;
use crate::binomialreg::phantom_binomial_regression;
use bustools_core::consistent_genes::Ec2GeneMapper;
use bustools_core::io::{BusFolder, BusRecord};
use bustools_core::merger::MultiIterator;
use bustools_core::utils::get_spinner;
use bustools_core::{
consistent_genes::{GeneId, EC},
iterators::CbUmiGroupIterator,
};
use itertools::izip;
use polars::io::SerWriter;
use polars::prelude::Column;
use serde::{Deserialize, Serialize};
use std::{
collections::{HashMap, HashSet},
fs::File,
hash::Hash,
io::{BufRead, BufReader, Write},
time::Instant,
};
#[derive(Debug)]
pub struct FingerprintHistogram {
pub(crate) samplenames: Vec<String>,
pub(crate) histogram: HashMap<Vec<u32>, usize>,
}
#[derive(Eq, PartialEq, Hash, Ord, PartialOrd, Clone, Debug, Copy, Deserialize, Serialize)]
pub struct AmpFactor(u32);
impl FingerprintHistogram {
pub fn get_samplenames(&self) -> Vec<String> {
self.samplenames.clone()
}
pub fn get_histogram(&self) -> HashMap<Vec<u32>, usize> {
self.histogram.clone()
}
pub fn from_csv(filename: &str) -> Self {
let fh = File::open(filename).unwrap();
let mut samplenames: Vec<String> = Vec::new();
let mut histogram: HashMap<Vec<u32>, usize> = HashMap::new();
for (i, line) in BufReader::new(fh).lines().flatten().enumerate() {
if i == 0 {
for s in line.split(',') {
samplenames.push(s.to_string());
}
let fstring = samplenames.pop().unwrap();
assert_eq!(fstring, "frequency");
} else {
let mut fingerprint: Vec<u32> =
line.split(',').map(|e| e.parse::<u32>().unwrap()).collect();
let freq = fingerprint.pop().unwrap();
histogram.insert(fingerprint, freq as usize);
}
}
FingerprintHistogram {
samplenames,
histogram,
}
}
pub fn new(sample_order: &[String]) -> Self {
let hist = HashMap::new();
FingerprintHistogram {
samplenames: sample_order.to_owned(), histogram: hist,
}
}
pub fn add(
&mut self,
record_dict: HashMap<String, Vec<BusRecord>>,
ecmapper_dict: &HashMap<String, &Ec2GeneMapper>,
) {
let grouped_record_dicts = groupby_gene_across_samples(&record_dict, ecmapper_dict);
let mut fingerprints: Vec<Vec<u32>> = Vec::new();
for rd in grouped_record_dicts {
let fp_hash = make_fingerprint_simple(&rd);
let fp: Vec<_> = self
.samplenames
.iter()
.map(|s| fp_hash.get(s).unwrap_or(&0))
.cloned()
.collect();
fingerprints.push(fp);
}
for fp in fingerprints {
let v = self.histogram.entry(fp).or_insert(0);
*v += 1;
}
}
pub fn to_dataframe(&self) -> polars::frame::DataFrame {
let mut columns:HashMap<String, Vec<u32>> = HashMap::new();
for (fingerprint, freq) in self.histogram.iter() {
assert_eq!(fingerprint.len(), self.samplenames.len());
for (s, fp) in izip!(&self.samplenames, fingerprint) {
let v = columns.entry(s.to_string()).or_default();
v.push(*fp)
}
let v= columns.entry("frequency".to_owned()).or_default();
v.push(*freq as u32)
}
let mut series = Vec::new();
for (name, values) in columns {
series.push(Column::new(name.into(), values))
}
polars::frame::DataFrame::new(series).unwrap()
}
pub fn to_csv(&self, outfile: &str) {
let mut fh = File::create(outfile).expect("Cant create file: {outfile}");
let mut header = self.samplenames.join(",");
header.push_str(",frequency");
writeln!(fh, "{}", header).unwrap();
for (fingerprint, freq) in self.histogram.iter() {
let mut s = fingerprint
.iter()
.map(|i| i.to_string())
.collect::<Vec<String>>()
.join(",");
s.push_str(&format!(",{}", freq));
writeln!(fh, "{}", s).unwrap();
}
}
pub fn to_csv2(&self, outfile: &str) {
let mut df = self.to_dataframe();
let mut file = std::fs::File::create(outfile).unwrap();
polars::prelude::CsvWriter::new(&mut file).finish(&mut df).unwrap();
}
fn get_z_r(&self) -> HashMap<AmpFactor, usize>{
let mut z_r: HashMap<AmpFactor, usize> = HashMap::new();
for (fingerprint, freq) in self.histogram.iter() {
let r: AmpFactor = AmpFactor(fingerprint.iter().sum());
let n_experiments = fingerprint.iter().filter(|x| **x > 0).count();
if n_experiments == 1 {
let v = z_r.entry(r).or_insert(0);
*v += freq;
}
}
z_r
}
fn get_m_r(&self) -> HashMap<AmpFactor, usize>{
let mut m_r: HashMap<AmpFactor, usize> = HashMap::new();
for (fingerprint, freq) in self.histogram.iter() {
let r: AmpFactor = AmpFactor(fingerprint.iter().sum());
let v = m_r.entry(r).or_insert(0);
*v += freq;
}
m_r
}
pub fn estimate_sihr(&self) -> f64 {
let z_r = self.get_z_r();
let m_r = self.get_m_r();
let mut r: Vec<AmpFactor> = m_r.keys().map(|k| k.to_owned()).collect();
r.sort();
let z: Vec<usize> = r.iter().map(|x| *z_r.get(x).unwrap_or(&0)).collect(); let m: Vec<usize> = r.iter().map(|x| *m_r.get(x).unwrap()).collect();
let r_usize: Vec<_> = r.iter().map(|x| x.0 as usize).collect(); let (pmax, _prange, _loglike_range) =
phantom_binomial_regression(&z, &m, &r_usize, self.samplenames.len());
pmax
}
}
pub fn groupby_gene_across_samples(
record_dict: &HashMap<String, Vec<BusRecord>>,
ecmapper_dict: &HashMap<String, &Ec2GeneMapper>,
) -> Vec<HashMap<String, BusRecord>> {
if record_dict.len() == 1 {
};
let mut big_hash: HashMap<String, (&BusRecord, String, &HashSet<GeneId>)> =
HashMap::with_capacity(record_dict.len());
let mut id_counter: i32 = 0;
for (sname, records) in record_dict.iter() {
let ecmapper = ecmapper_dict.get(sname).unwrap();
for r in records {
let g = ecmapper.get_genes(EC(r.EC));
let id_str = id_counter.to_string();
big_hash.insert(id_str, (r, sname.clone(), g));
id_counter += 1;
}
}
let mut disjoint_set = DisjointSubsets::new();
for (id, (_r, _sname, gset)) in big_hash.iter() {
disjoint_set.add(id.clone(), (*gset).clone());
}
let mut emit_vector: Vec<HashMap<String, BusRecord>> = Vec::new();
for (ids_of_set_elements, _geneset) in disjoint_set.get_disjoint_set_ids_and_set() {
let mut sample_grouped: HashMap<String, Vec<BusRecord>> = HashMap::new();
for el_id in ids_of_set_elements {
let (record, samplename, _genes) = big_hash.remove(&el_id).unwrap();
let rlist = sample_grouped.entry(samplename).or_default();
rlist.push(record.clone());
}
let mut sample_grouped_aggr: HashMap<String, BusRecord> = HashMap::new();
for (s, recordlist) in sample_grouped.iter() {
let freq = recordlist.iter().map(|r| r.COUNT).sum();
let mut rnew = recordlist.first().unwrap().clone();
rnew.COUNT = freq;
sample_grouped_aggr.insert(s.to_string(), rnew);
}
emit_vector.push(sample_grouped_aggr);
}
emit_vector
}
pub fn make_fingerprint_histogram(busfolders: HashMap<String, BusFolder>, t2g_file: &str) -> FingerprintHistogram {
println!("Making EC mappers");
let ec_tmp = ec_mapper_dict_from_busfolders(&busfolders, t2g_file );
let ecmapper_dict: HashMap<String, &Ec2GeneMapper> = ec_tmp.iter().map(|(name, d)| (name.clone(), d )).collect();
let now = Instant::now();
let result = _make_fingerprint_histogram(&busfolders, &ecmapper_dict);
let elapsed_time = now.elapsed();
println!(
"Ran _make_fingerprint_histogram, took {} seconds.",
elapsed_time.as_secs()
);
result
}
fn _make_fingerprint_histogram(
busfolders: &HashMap<String, BusFolder>,
ecmapper_dict: &HashMap<String, &Ec2GeneMapper>,
) -> FingerprintHistogram {
let iterators = busfolders.iter().map(|(k,v)| (k.clone(), v.get_iterator().groupby_cbumi())).collect();
let multi_iter = MultiIterator::new(iterators);
let bar = get_spinner();
let mut order: Vec<_> = busfolders.keys().cloned().collect();
order.sort();
let mut fp_histo = FingerprintHistogram::new(&order);
for (i, ((_cb, _umi), record_dict)) in multi_iter.enumerate() {
fp_histo.add(record_dict, ecmapper_dict);
if i % 1000000 == 0 {
bar.inc(1000000);
}
}
fp_histo
}
pub type Fingerprint = HashMap<String, u32>;
pub fn make_fingerprint_simple(record_dict: &HashMap<String, BusRecord>) -> Fingerprint {
let fingerprint: Fingerprint = record_dict
.iter()
.map(|(k, v)| (k.clone(), v.COUNT))
.collect();
fingerprint
}
#[cfg(test)]
pub mod tests {
use bustools_core::{
consistent_genes::{Ec2GeneMapper, Genename},
io::{BusFolder, BusRecord},
};
use polars::io::SerWriter;
use statrs::assert_almost_eq;
use std::collections::HashMap;
use super::*;
fn create_dummy_ec() -> Ec2GeneMapper {
let ec0: HashSet<Genename> = vec![Genename("A".to_string())].into_iter().collect();
let ec1: HashSet<Genename> = vec![Genename("B".to_string())].into_iter().collect();
let ec2: HashSet<Genename> = vec![Genename("A".to_string()), Genename("B".to_string())].into_iter().collect();
let ec3: HashSet<Genename> = vec![Genename("C".to_string()), Genename("D".to_string())].into_iter().collect();
let ec_dict: HashMap<EC, HashSet<Genename>> = HashMap::from([
(EC(0), ec0), (EC(1), ec1), (EC(2), ec2), (EC(3), ec3), ]);
Ec2GeneMapper::new(ec_dict)
}
#[test]
fn test_groupby_gene_across_samples() {
let es1 = create_dummy_ec();
let es2 = create_dummy_ec();
let es_dict: HashMap<String, &Ec2GeneMapper> =
vec![("s1".to_string(), &es1), ("s2".to_string(), &es2)]
.into_iter()
.collect();
let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0};
let r2 =BusRecord{CB: 0, UMI: 1, EC: 2, COUNT: 2, FLAG: 0};
let s1 = BusRecord{CB: 0, UMI: 1, EC: 2, COUNT: 3, FLAG: 0};
let s2 = BusRecord{CB: 0, UMI: 1, EC: 2, COUNT: 3, FLAG: 0};
let record_dict = vec![
("s1".to_string(), vec![r1, r2]),
("s2".to_string(), vec![s1, s2]),
]
.into_iter()
.collect();
let res = groupby_gene_across_samples(&record_dict, &es_dict);
assert_eq!(res.len(), 1);
let r = res[0].clone();
assert_eq!(r.get("s1").unwrap().COUNT, 4);
assert_eq!(r.get("s2").unwrap().COUNT, 6);
let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0}; let r2 =BusRecord{CB: 0, UMI: 1, EC: 2, COUNT: 2, FLAG: 0};
let s1 = BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 3, FLAG: 0}; let s2 = BusRecord{CB: 0, UMI: 1, EC: 1, COUNT: 3, FLAG: 0};
let record_dict = vec![
("s1".to_string(), vec![r1, r2]),
("s2".to_string(), vec![s1, s2]),
]
.into_iter()
.collect();
let res = groupby_gene_across_samples(&record_dict, &es_dict);
println!("{:?}", res);
}
#[test]
fn test_make_fingerprint_histogram() {
use bustools_core::io::setup_busfile;
let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0};
let s1 = BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 3, FLAG: 0};
let r2 =BusRecord{CB: 1, UMI: 2, EC: 0, COUNT: 12, FLAG: 0};
let s2 = BusRecord{CB: 1, UMI: 2, EC: 1, COUNT: 10, FLAG: 0};
let r3 =BusRecord{CB: 1, UMI: 3, EC: 0, COUNT: 2, FLAG: 0};
let s3 = BusRecord{CB: 1, UMI: 3, EC: 2, COUNT: 3, FLAG: 0};
let v1 = vec![r1.clone(), r2.clone(), r3.clone()];
let v2 = vec![s1.clone(), s2.clone(), s3.clone()];
let (busname1, _dir1) = setup_busfile(&v1);
let (busname2, _dir2) = setup_busfile(&v2);
let hashmap = HashMap::from([
("s1".to_string(), BusFolder::new(&busname1)),
("s2".to_string(), BusFolder::new(&busname2)),
]);
let es1 = create_dummy_ec();
let es2 = create_dummy_ec();
let es_dict: HashMap<String, &Ec2GeneMapper> =
vec![("s1".to_string(), &es1), ("s2".to_string(), &es2)]
.into_iter()
.collect();
let s = _make_fingerprint_histogram(&hashmap, &es_dict);
println!("{:?}", s);
let e = *s.histogram.get(&vec![12, 0]).unwrap();
assert_eq!(e, 1);
let e = *s.histogram.get(&vec![0, 10]).unwrap();
assert_eq!(e, 1);
let e = *s.histogram.get(&vec![2, 3]).unwrap();
assert_eq!(e, 2);
s.to_csv("/tmp/finger.csv")
}
#[test]
fn test_make_fingerprint_simple() {
let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0};
let s1 = BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 3, FLAG: 0};
let record_dict = vec![("s1".to_string(), r1), ("s2".to_string(), s1)]
.into_iter()
.collect();
let res = make_fingerprint_simple(&record_dict);
println!("{:?}", res);
let exp: HashMap<_, _> = vec![("s1".to_string(), 2), ("s2".to_string(), 3)]
.into_iter()
.collect();
assert_eq!(res, exp);
}
pub fn testing2() {
let t2g = "/home/michi/bus_testing/transcripts_to_genes.txt";
let b1 = BusFolder::new("/home/michi/bus_testing/bus_output_short/");
let b2 = BusFolder::new("/home/michi/bus_testing/bus_output_short/");
let hashmap = HashMap::from([("full".to_string(), b1), ("short".to_string(), b2)]);
let s = make_fingerprint_histogram(hashmap, t2g);
s.to_csv("/tmp/testing2.csv");
println!("{:?}", s)
}
#[test]
pub fn test_csv_read_write() {
let order = vec!["s1", "s2"]
.into_iter()
.map(|x| x.to_string())
.collect();
let mut histogram: HashMap<Vec<u32>, usize> = HashMap::new();
histogram.insert(vec![1, 0], 1);
histogram.insert(vec![0, 1], 1);
histogram.insert(vec![1, 1], 1);
let fph = FingerprintHistogram {
samplenames: order,
histogram,
};
println!("{:?}", fph);
let t = FingerprintHistogram::from_csv("/tmp/finger.csv");
println!("{:?}", t);
assert_eq!(fph.histogram, t.histogram);
let p = t.estimate_sihr();
println!("{}", p);
}
#[test]
pub fn test_estimate_sihr() {
let order = vec!["s1", "s2"]
.into_iter()
.map(|x| x.to_string())
.collect();
let mut histogram: HashMap<Vec<u32>, usize> = HashMap::new();
histogram.insert(vec![1, 0], 1);
histogram.insert(vec![0, 1], 1);
histogram.insert(vec![1, 1], 1);
let fph = FingerprintHistogram {
samplenames: order,
histogram,
};
let p = fph.estimate_sihr();
println!("{}", p);
assert_almost_eq!(p, 0.5, 0.001);
}
#[test]
fn test_sihr_real(){
let fph = FingerprintHistogram::from_csv("/home/michi/Dropbox/rustphantompurger/IR56_57_phantom.csv");
let sihr = fph.estimate_sihr();
insta::assert_yaml_snapshot!(sihr, @r###"
---
0.0016278754068794856
"###);
}
#[test]
fn test_zr_real(){
let fph = FingerprintHistogram::from_csv("/home/michi/Dropbox/rustphantompurger/IR56_57_phantom.csv");
let z_r = fph.get_z_r();
insta::with_settings!({sort_maps => true}, {
insta::assert_yaml_snapshot!(z_r);
});
}
#[test]
fn test_mr_real(){
let fph = FingerprintHistogram::from_csv("/home/michi/Dropbox/rustphantompurger/IR56_57_phantom.csv");
let m_r = fph.get_m_r();
insta::with_settings!({sort_maps => true}, {
insta::assert_yaml_snapshot!(m_r);
});
}
#[test]
fn test_to_dataframe() {
let fph = FingerprintHistogram::from_csv("/home/michi/Dropbox/rustphantompurger/IR56_57_phantom.csv");
let mut df = fph.to_dataframe();
let mut file = std::fs::File::create("/tmp/path.csv").unwrap();
polars::prelude::CsvWriter::new(&mut file).finish(&mut df).unwrap();
}
}