1use crate::disjoint::DisjointSubsets;
2use crate::utils::ec_mapper_dict_from_busfolders;
3use crate::binomialreg::phantom_binomial_regression;
4use bustools_core::consistent_genes::Ec2GeneMapper;
5use bustools_core::io::{BusFolder, BusRecord};
6use bustools_core::merger::MultiIterator;
7use bustools_core::utils::get_spinner;
8use bustools_core::{
9 consistent_genes::{GeneId, EC},
10 iterators::CbUmiGroupIterator,
11};
12use itertools::izip;
13use polars::io::SerWriter;
14use polars::prelude::Column;
15use serde::{Deserialize, Serialize};
16use std::{
17 collections::{HashMap, HashSet},
18 fs::File,
19 hash::Hash,
20 io::{BufRead, BufReader, Write},
21 time::Instant,
22};
23
24#[derive(Debug)]
29pub struct FingerprintHistogram {
30 pub(crate) samplenames: Vec<String>,
32 pub(crate) histogram: HashMap<Vec<u32>, usize>,
34}
35
36#[derive(Eq, PartialEq, Hash, Ord, PartialOrd, Clone, Debug, Copy, Deserialize, Serialize)]
37pub struct AmpFactor(u32);
38
39impl FingerprintHistogram {
40
41 pub fn get_samplenames(&self) -> Vec<String> {
42 self.samplenames.clone()
43 }
44
45 pub fn get_histogram(&self) -> HashMap<Vec<u32>, usize> {
46 self.histogram.clone()
47 }
48
49 pub fn from_csv(filename: &str) -> Self {
51 let fh = File::open(filename).unwrap();
52 let mut samplenames: Vec<String> = Vec::new();
53 let mut histogram: HashMap<Vec<u32>, usize> = HashMap::new();
54 for (i, line) in BufReader::new(fh).lines().flatten().enumerate() {
55 if i == 0 {
56 for s in line.split(',') {
57 samplenames.push(s.to_string());
58 }
59 let fstring = samplenames.pop().unwrap();
61 assert_eq!(fstring, "frequency");
62 } else {
63 let mut fingerprint: Vec<u32> =
64 line.split(',').map(|e| e.parse::<u32>().unwrap()).collect();
65 let freq = fingerprint.pop().unwrap();
67 histogram.insert(fingerprint, freq as usize);
68 }
69 }
70 FingerprintHistogram {
71 samplenames,
72 histogram,
73 }
74 }
75
76 pub fn new(sample_order: &[String]) -> Self {
77 let hist = HashMap::new();
78 FingerprintHistogram {
79 samplenames: sample_order.to_owned(), histogram: hist,
81 }
82 }
83
84 pub fn add(
86 &mut self,
87 record_dict: HashMap<String, Vec<BusRecord>>,
88 ecmapper_dict: &HashMap<String, &Ec2GeneMapper>,
89 ) {
90 let grouped_record_dicts = groupby_gene_across_samples(&record_dict, ecmapper_dict);
93 let mut fingerprints: Vec<Vec<u32>> = Vec::new();
94 for rd in grouped_record_dicts {
95 let fp_hash = make_fingerprint_simple(&rd);
96 let fp: Vec<_> = self
98 .samplenames
99 .iter()
100 .map(|s| fp_hash.get(s).unwrap_or(&0))
101 .cloned()
102 .collect();
103 fingerprints.push(fp);
104 }
105
106 for fp in fingerprints {
107 let v = self.histogram.entry(fp).or_insert(0);
109 *v += 1;
110 }
111 }
112
113 pub fn to_dataframe(&self) -> polars::frame::DataFrame {
115 let mut columns:HashMap<String, Vec<u32>> = HashMap::new();
118
119 for (fingerprint, freq) in self.histogram.iter() {
121 assert_eq!(fingerprint.len(), self.samplenames.len());
122 for (s, fp) in izip!(&self.samplenames, fingerprint) {
123 let v = columns.entry(s.to_string()).or_default();
124 v.push(*fp)
125 }
126 let v= columns.entry("frequency".to_owned()).or_default();
127 v.push(*freq as u32)
128 }
129
130 let mut series = Vec::new();
131 for (name, values) in columns {
132 series.push(Column::new(name.into(), values))
133 }
134 polars::frame::DataFrame::new(series).unwrap()
135 }
136
137 pub fn to_csv(&self, outfile: &str) {
139 let mut fh = File::create(outfile).expect("Cant create file: {outfile}");
140 let mut header = self.samplenames.join(",");
141 header.push_str(",frequency");
142 writeln!(fh, "{}", header).unwrap();
143
144 for (fingerprint, freq) in self.histogram.iter() {
145 let mut s = fingerprint
147 .iter()
148 .map(|i| i.to_string())
149 .collect::<Vec<String>>()
150 .join(",");
151 s.push_str(&format!(",{}", freq));
152 writeln!(fh, "{}", s).unwrap();
153 }
154 }
155
156 pub fn to_csv2(&self, outfile: &str) {
158 let mut df = self.to_dataframe();
159 let mut file = std::fs::File::create(outfile).unwrap();
160 polars::prelude::CsvWriter::new(&mut file).finish(&mut df).unwrap();
161 }
162
163 fn get_z_r(&self) -> HashMap<AmpFactor, usize>{
165 let mut z_r: HashMap<AmpFactor, usize> = HashMap::new();
166
167 for (fingerprint, freq) in self.histogram.iter() {
168 let r: AmpFactor = AmpFactor(fingerprint.iter().sum());
169 let n_experiments = fingerprint.iter().filter(|x| **x > 0).count();
170 if n_experiments == 1 {
171 let v = z_r.entry(r).or_insert(0);
172 *v += freq;
173 }
174 }
175 z_r
176 }
177
178 fn get_m_r(&self) -> HashMap<AmpFactor, usize>{
180
181 let mut m_r: HashMap<AmpFactor, usize> = HashMap::new();
182 for (fingerprint, freq) in self.histogram.iter() {
183 let r: AmpFactor = AmpFactor(fingerprint.iter().sum());
184 let v = m_r.entry(r).or_insert(0);
185 *v += freq;
186 }
187 m_r
188 }
189
190 pub fn estimate_sihr(&self) -> f64 {
195 let z_r = self.get_z_r();
198 let m_r = self.get_m_r();
199
200 let mut r: Vec<AmpFactor> = m_r.keys().map(|k| k.to_owned()).collect();
201 r.sort();
202
203 let z: Vec<usize> = r.iter().map(|x| *z_r.get(x).unwrap_or(&0)).collect(); let m: Vec<usize> = r.iter().map(|x| *m_r.get(x).unwrap()).collect();
205
206 let r_usize: Vec<_> = r.iter().map(|x| x.0 as usize).collect(); let (pmax, _prange, _loglike_range) =
208 phantom_binomial_regression(&z, &m, &r_usize, self.samplenames.len());
209
210 pmax
216 }
217}
218
219pub fn groupby_gene_across_samples(
220 record_dict: &HashMap<String, Vec<BusRecord>>,
221 ecmapper_dict: &HashMap<String, &Ec2GeneMapper>,
222) -> Vec<HashMap<String, BusRecord>> {
223 if record_dict.len() == 1 {
230 };
232
233 let mut big_hash: HashMap<String, (&BusRecord, String, &HashSet<GeneId>)> =
236 HashMap::with_capacity(record_dict.len());
237 let mut id_counter: i32 = 0;
238 for (sname, records) in record_dict.iter() {
239 let ecmapper = ecmapper_dict.get(sname).unwrap();
240 for r in records {
241 let g = ecmapper.get_genes(EC(r.EC));
242 let id_str = id_counter.to_string();
243 big_hash.insert(id_str, (r, sname.clone(), g));
244 id_counter += 1;
245 }
246 }
247
248 let mut disjoint_set = DisjointSubsets::new();
250 for (id, (_r, _sname, gset)) in big_hash.iter() {
251 disjoint_set.add(id.clone(), (*gset).clone());
252 }
253
254 let mut emit_vector: Vec<HashMap<String, BusRecord>> = Vec::new();
256 for (ids_of_set_elements, _geneset) in disjoint_set.get_disjoint_set_ids_and_set() {
257 let mut sample_grouped: HashMap<String, Vec<BusRecord>> = HashMap::new();
267 for el_id in ids_of_set_elements {
268 let (record, samplename, _genes) = big_hash.remove(&el_id).unwrap();
270
271 let rlist = sample_grouped.entry(samplename).or_default();
272 rlist.push(record.clone());
273 }
274
275 let mut sample_grouped_aggr: HashMap<String, BusRecord> = HashMap::new();
276 for (s, recordlist) in sample_grouped.iter() {
277 let freq = recordlist.iter().map(|r| r.COUNT).sum();
278 let mut rnew = recordlist.first().unwrap().clone();
279 rnew.COUNT = freq;
280
281 sample_grouped_aggr.insert(s.to_string(), rnew);
287 }
288 emit_vector.push(sample_grouped_aggr);
289 }
290 emit_vector
291}
292
293
294
295pub fn make_fingerprint_histogram(busfolders: HashMap<String, BusFolder>, t2g_file: &str) -> FingerprintHistogram {
299
300 println!("Making EC mappers");
305 let ec_tmp = ec_mapper_dict_from_busfolders(&busfolders, t2g_file );
306 let ecmapper_dict: HashMap<String, &Ec2GeneMapper> = ec_tmp.iter().map(|(name, d)| (name.clone(), d )).collect();
307
308 let now = Instant::now();
309 let result = _make_fingerprint_histogram(&busfolders, &ecmapper_dict);
310 let elapsed_time = now.elapsed();
311 println!(
312 "Ran _make_fingerprint_histogram, took {} seconds.",
313 elapsed_time.as_secs()
314 );
315 result
316}
317
318fn _make_fingerprint_histogram(
319 busfolders: &HashMap<String, BusFolder>,
320 ecmapper_dict: &HashMap<String, &Ec2GeneMapper>,
321) -> FingerprintHistogram {
322
323 let iterators = busfolders.iter().map(|(k,v)| (k.clone(), v.get_iterator().groupby_cbumi())).collect();
325 let multi_iter = MultiIterator::new(iterators);
326
327 let bar = get_spinner();
328
329 let mut order: Vec<_> = busfolders.keys().cloned().collect();
330 order.sort();
331
332 let mut fp_histo = FingerprintHistogram::new(&order);
333
334 for (i, ((_cb, _umi), record_dict)) in multi_iter.enumerate() {
335 fp_histo.add(record_dict, ecmapper_dict);
336
337 if i % 1000000 == 0 {
338 bar.inc(1000000);
339 }
340 }
341 fp_histo
342}
343
344pub type Fingerprint = HashMap<String, u32>;
345
346pub fn make_fingerprint_simple(record_dict: &HashMap<String, BusRecord>) -> Fingerprint {
348 let fingerprint: Fingerprint = record_dict
351 .iter()
352 .map(|(k, v)| (k.clone(), v.COUNT))
353 .collect();
354 fingerprint
355}
356
357#[cfg(test)]
358pub mod tests {
359 use bustools_core::{
360 consistent_genes::{Ec2GeneMapper, Genename},
361 io::{BusFolder, BusRecord},
362 };
363 use polars::io::SerWriter;
364 use statrs::assert_almost_eq;
365 use std::collections::HashMap;
366 use super::*;
367
368 fn create_dummy_ec() -> Ec2GeneMapper {
369 let ec0: HashSet<Genename> = vec![Genename("A".to_string())].into_iter().collect();
370 let ec1: HashSet<Genename> = vec![Genename("B".to_string())].into_iter().collect();
371 let ec2: HashSet<Genename> = vec![Genename("A".to_string()), Genename("B".to_string())].into_iter().collect();
372 let ec3: HashSet<Genename> = vec![Genename("C".to_string()), Genename("D".to_string())].into_iter().collect();
373
374 let ec_dict: HashMap<EC, HashSet<Genename>> = HashMap::from([
375 (EC(0), ec0), (EC(1), ec1), (EC(2), ec2), (EC(3), ec3), ]);
380 Ec2GeneMapper::new(ec_dict)
381 }
382
383 #[test]
384 fn test_groupby_gene_across_samples() {
385 let es1 = create_dummy_ec();
386 let es2 = create_dummy_ec();
387 let es_dict: HashMap<String, &Ec2GeneMapper> =
388 vec![("s1".to_string(), &es1), ("s2".to_string(), &es2)]
389 .into_iter()
390 .collect();
391
392 let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0};
394 let r2 =BusRecord{CB: 0, UMI: 1, EC: 2, COUNT: 2, FLAG: 0};
395
396 let s1 = BusRecord{CB: 0, UMI: 1, EC: 2, COUNT: 3, FLAG: 0};
398 let s2 = BusRecord{CB: 0, UMI: 1, EC: 2, COUNT: 3, FLAG: 0};
399
400 let record_dict = vec![
401 ("s1".to_string(), vec![r1, r2]),
402 ("s2".to_string(), vec![s1, s2]),
403 ]
404 .into_iter()
405 .collect();
406
407 let res = groupby_gene_across_samples(&record_dict, &es_dict);
408
409 assert_eq!(res.len(), 1);
411 let r = res[0].clone();
413 assert_eq!(r.get("s1").unwrap().COUNT, 4);
414 assert_eq!(r.get("s2").unwrap().COUNT, 6);
415
416 let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0}; let r2 =BusRecord{CB: 0, UMI: 1, EC: 2, COUNT: 2, FLAG: 0}; let s1 = BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 3, FLAG: 0}; let s2 = BusRecord{CB: 0, UMI: 1, EC: 1, COUNT: 3, FLAG: 0}; let record_dict = vec![
430 ("s1".to_string(), vec![r1, r2]),
431 ("s2".to_string(), vec![s1, s2]),
432 ]
433 .into_iter()
434 .collect();
435 let res = groupby_gene_across_samples(&record_dict, &es_dict);
436 println!("{:?}", res);
437 }
438
439 #[test]
440 fn test_make_fingerprint_histogram() {
441 use bustools_core::io::setup_busfile;
442
443 let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0};
445 let s1 = BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 3, FLAG: 0};
446
447 let r2 =BusRecord{CB: 1, UMI: 2, EC: 0, COUNT: 12, FLAG: 0};
449
450 let s2 = BusRecord{CB: 1, UMI: 2, EC: 1, COUNT: 10, FLAG: 0};
452
453
454 let r3 =BusRecord{CB: 1, UMI: 3, EC: 0, COUNT: 2, FLAG: 0};
456 let s3 = BusRecord{CB: 1, UMI: 3, EC: 2, COUNT: 3, FLAG: 0};
457
458 let v1 = vec![r1.clone(), r2.clone(), r3.clone()];
464 let v2 = vec![s1.clone(), s2.clone(), s3.clone()];
465
466 let (busname1, _dir1) = setup_busfile(&v1);
468 let (busname2, _dir2) = setup_busfile(&v2);
469
470 let hashmap = HashMap::from([
471 ("s1".to_string(), BusFolder::new(&busname1)),
472 ("s2".to_string(), BusFolder::new(&busname2)),
473 ]);
474
475 let es1 = create_dummy_ec();
476 let es2 = create_dummy_ec();
477 let es_dict: HashMap<String, &Ec2GeneMapper> =
478 vec![("s1".to_string(), &es1), ("s2".to_string(), &es2)]
479 .into_iter()
480 .collect();
481
482 let s = _make_fingerprint_histogram(&hashmap, &es_dict);
483 println!("{:?}", s);
484
485 let e = *s.histogram.get(&vec![12, 0]).unwrap();
486 assert_eq!(e, 1);
487
488 let e = *s.histogram.get(&vec![0, 10]).unwrap();
489 assert_eq!(e, 1);
490
491 let e = *s.histogram.get(&vec![2, 3]).unwrap();
492 assert_eq!(e, 2);
493
494 s.to_csv("/tmp/finger.csv")
495
496 }
498
499 #[test]
500 fn test_make_fingerprint_simple() {
501
502 let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0};
503 let s1 = BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 3, FLAG: 0};
504
505 let record_dict = vec![("s1".to_string(), r1), ("s2".to_string(), s1)]
506 .into_iter()
507 .collect();
508 let res = make_fingerprint_simple(&record_dict);
509 println!("{:?}", res);
510
511 let exp: HashMap<_, _> = vec![("s1".to_string(), 2), ("s2".to_string(), 3)]
512 .into_iter()
513 .collect();
514 assert_eq!(res, exp);
515 }
516
517 pub fn testing2() {
519 let t2g = "/home/michi/bus_testing/transcripts_to_genes.txt";
520 let b1 = BusFolder::new("/home/michi/bus_testing/bus_output_short/");
521 let b2 = BusFolder::new("/home/michi/bus_testing/bus_output_short/");
522 let hashmap = HashMap::from([("full".to_string(), b1), ("short".to_string(), b2)]);
523 let s = make_fingerprint_histogram(hashmap, t2g);
524
525 s.to_csv("/tmp/testing2.csv");
526 println!("{:?}", s)
527 }
528
529 #[test]
530 pub fn test_csv_read_write() {
531 let order = vec!["s1", "s2"]
532 .into_iter()
533 .map(|x| x.to_string())
534 .collect();
535 let mut histogram: HashMap<Vec<u32>, usize> = HashMap::new();
536
537 histogram.insert(vec![1, 0], 1);
538 histogram.insert(vec![0, 1], 1);
539 histogram.insert(vec![1, 1], 1);
540
541 let fph = FingerprintHistogram {
542 samplenames: order,
543 histogram,
544 };
545
546 println!("{:?}", fph);
549 let t = FingerprintHistogram::from_csv("/tmp/finger.csv");
550 println!("{:?}", t);
551
552 assert_eq!(fph.histogram, t.histogram);
553
554 let p = t.estimate_sihr();
555 println!("{}", p);
556 }
557
558 #[test]
559 pub fn test_estimate_sihr() {
560 let order = vec!["s1", "s2"]
561 .into_iter()
562 .map(|x| x.to_string())
563 .collect();
564 let mut histogram: HashMap<Vec<u32>, usize> = HashMap::new();
565
566 histogram.insert(vec![1, 0], 1);
567 histogram.insert(vec![0, 1], 1);
568 histogram.insert(vec![1, 1], 1);
569
570 let fph = FingerprintHistogram {
571 samplenames: order,
572 histogram,
573 };
574
575 let p = fph.estimate_sihr();
576 println!("{}", p);
577
578 assert_almost_eq!(p, 0.5, 0.001);
579 }
580
581 #[test]
582 fn test_sihr_real(){
583 let fph = FingerprintHistogram::from_csv("/home/michi/Dropbox/rustphantompurger/IR56_57_phantom.csv");
584 let sihr = fph.estimate_sihr();
585 insta::assert_yaml_snapshot!(sihr, @r###"
587 ---
588 0.0016278754068794856
589 "###);
590 }
591
592 #[test]
593 fn test_zr_real(){
594 let fph = FingerprintHistogram::from_csv("/home/michi/Dropbox/rustphantompurger/IR56_57_phantom.csv");
595 let z_r = fph.get_z_r();
597
598 insta::with_settings!({sort_maps => true}, {
601 insta::assert_yaml_snapshot!(z_r);
602 });
603 }
604
605 #[test]
606 fn test_mr_real(){
607 let fph = FingerprintHistogram::from_csv("/home/michi/Dropbox/rustphantompurger/IR56_57_phantom.csv");
608 let m_r = fph.get_m_r();
610
611 insta::with_settings!({sort_maps => true}, {
614 insta::assert_yaml_snapshot!(m_r);
615 });
616 }
617
618 #[test]
619 fn test_to_dataframe() {
620 let fph = FingerprintHistogram::from_csv("/home/michi/Dropbox/rustphantompurger/IR56_57_phantom.csv");
621 let mut df = fph.to_dataframe();
622
623 let mut file = std::fs::File::create("/tmp/path.csv").unwrap();
624 polars::prelude::CsvWriter::new(&mut file).finish(&mut df).unwrap();
625 }
626}