phantompurger_rs/
cb_overlap.rs1use std::{collections::HashMap, fs::File};
2use flate2::{write::GzEncoder, Compression};
3use itertools::Itertools;
4use bustools_core::{io::BusReader, iterators::CellGroupIterator, merger::MultiIterator, utils::get_progressbar};
5use std::{
6 hash::Hash,
7 io::Write,
8};
9use crate::utils::valmap_ref;
10
11#[derive(Eq, PartialEq, Hash, Ord, PartialOrd, Clone, Debug, Copy)]
12pub struct CB(u64);
13
14pub fn detect_cell_overlap(busfolders: &HashMap<String, String>, outfile: &str) {
18
19 if !outfile.ends_with(".csv") && !outfile.ends_with(".csv.gz") {
20 panic!("unknwon file extension. must be either .csv or .csv.gz")
21 }
22
23 let cbs_per_file = valmap_ref(
25 |busfile| {
26 println!("determine size of iterator {busfile}");
27 BusReader::new(busfile).groupby_cb().count()
28 },
29 busfolders,
30 );
31
32 let cb_iterators = valmap_ref(
33 |busfile| {
34 BusReader::new(busfile).groupby_cb()
35 },
36 busfolders,
37 );
38
39 println!("total records {:?}", cbs_per_file);
40 let total: usize = cbs_per_file.values().sum();
41
42 let samplenames: Vec<String> = busfolders.keys().cloned().collect();
43 let multi_iter = MultiIterator::new(cb_iterators);
44 let mut result: HashMap<CB, Vec<usize>> = HashMap::new();
45
46 let bar = get_progressbar(total as u64);
47
48 for (i, (c, record_dict)) in multi_iter.enumerate() {
49 let mut entry: Vec<usize> = Vec::new();
50 for s in samplenames.iter() {
51 let numi = match record_dict.get(s) {
52 Some(records) => records.iter().map(|r| r.UMI).unique().count(),
53 None => 0,
54 };
55 entry.push(numi)
56 }
57 result.insert(CB(c), entry);
58
59 if i % 100_000 == 0 {
60 bar.inc(100_000);
62 }
63 }
64
65 fn results_to_writer<W:Write>(mut writer: W, result: HashMap<CB, Vec<usize>>, samplenames: Vec<String>) {
66 let mut header = samplenames.join(",");
67 header.push_str(",CB");
68 writeln!(writer, "{}", header).unwrap();
69
70 for (cid, numis) in result.iter() {
71 let mut s = numis
73 .iter()
74 .map(|i| i.to_string())
75 .collect::<Vec<String>>()
76 .join(",");
77 s.push_str(&format!(",{}", cid.0));
78 writeln!(writer, "{}", s).unwrap();
79 }
80 }
81
82 if outfile.ends_with(".csv") {
83 let fh = File::create(outfile).unwrap();
86 results_to_writer(fh, result, samplenames);
87 } else if outfile.ends_with(".csv.gz") {
88 let fh = File::create(outfile).unwrap();
90 let e = GzEncoder::new(fh, Compression::default());
91 results_to_writer(e, result, samplenames);
92 } else {
93 panic!("unknwon file extension. must be either .csv or .csv.gz")
94 }
95}
96
97
98#[cfg(test)]
99mod test {
100 use bustools_core::io::{setup_busfile, BusRecord};
101 use super::*;
102
103 #[test]
104 fn test_detect_overlap() {
105 let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0};
106 let r2 = BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 3, FLAG: 0};
107 let (fname1, _d1) = setup_busfile(&vec![r1]);
108 let (fname2, _d2) = setup_busfile(&vec![r2]);
109 let busfolders = HashMap::from([
110 ("s1".to_string(), fname1.clone()),
111 ("s2".to_string(), fname2.clone()),
112 ]);
113 println!("{}", fname1);
114 detect_cell_overlap(&busfolders, "/tmp/overlap.csv");
115 detect_cell_overlap(&busfolders, "/tmp/overlap.csv.gz");
116 }
117
118 pub fn test_detect_cell_overlap() {
120 let busfolders = HashMap::from([
130 (
131 "full".to_string(),
132 "/home/michi/bus_testing/bus_output/output.corrected.sort.bus".to_string(),
133 ),
134 (
135 "short".to_string(),
136 "/home/michi/bus_testing/bus_output_short/output.corrected.sort.bus".to_string(),
137 ),
138 ]);
139
140 detect_cell_overlap(&busfolders, "/tmp/test_detect_cell_overlap.csv")
141 }
142}