phantompurger_rs/
cb_overlap.rs

1use std::{collections::HashMap, fs::File};
2use flate2::{write::GzEncoder, Compression};
3use itertools::Itertools;
4use bustools_core::{io::BusReader, iterators::CellGroupIterator, merger::MultiIterator, utils::get_progressbar};
5use std::{
6    hash::Hash,
7    io::Write,
8};
9use crate::utils::valmap_ref;
10
11#[derive(Eq, PartialEq, Hash, Ord, PartialOrd, Clone, Debug, Copy)]
12pub struct CB(u64);
13
14/// For each CB calcualte the number of UMIs per experiment/busfile.
15/// If extensive swapping has occured, we'll see shared CBs across experiments
16/// with correlated #umis
17pub fn detect_cell_overlap(busfolders: &HashMap<String, String>, outfile: &str) {
18
19    if !outfile.ends_with(".csv") && !outfile.ends_with(".csv.gz") {
20        panic!("unknwon file extension. must be either .csv or .csv.gz")
21    }
22
23    // figure out size of iterators, just for progress bar!
24    let cbs_per_file = valmap_ref(
25        |busfile| {
26            println!("determine size of iterator {busfile}");
27            BusReader::new(busfile).groupby_cb().count()
28        },
29        busfolders,
30    );
31
32    let cb_iterators = valmap_ref(
33        |busfile| {
34            BusReader::new(busfile).groupby_cb()
35        },
36        busfolders,
37    );
38
39    println!("total records {:?}", cbs_per_file);
40    let total: usize = cbs_per_file.values().sum();
41
42    let samplenames: Vec<String> = busfolders.keys().cloned().collect();
43    let multi_iter = MultiIterator::new(cb_iterators);
44    let mut result: HashMap<CB, Vec<usize>> = HashMap::new();
45
46    let bar = get_progressbar(total as u64);
47
48    for (i, (c, record_dict)) in multi_iter.enumerate() {
49        let mut entry: Vec<usize> = Vec::new();
50        for s in samplenames.iter() {
51            let numi = match record_dict.get(s) {
52                Some(records) => records.iter().map(|r| r.UMI).unique().count(),
53                None => 0,
54            };
55            entry.push(numi)
56        }
57        result.insert(CB(c), entry);
58
59        if i % 100_000 == 0 {
60            // cells iterations are usually rather small, i.e. millions, update more reg
61            bar.inc(100_000);
62        }
63    }
64
65    fn results_to_writer<W:Write>(mut writer: W, result: HashMap<CB, Vec<usize>>, samplenames: Vec<String>) {
66        let mut header = samplenames.join(",");
67        header.push_str(",CB");
68        writeln!(writer, "{}", header).unwrap();
69
70        for (cid, numis) in result.iter() {
71            // concat with commas
72            let mut s = numis
73                .iter()
74                .map(|i| i.to_string())
75                .collect::<Vec<String>>()
76                .join(",");
77            s.push_str(&format!(",{}", cid.0));
78            writeln!(writer, "{}", s).unwrap();
79        }
80    }
81
82    if outfile.ends_with(".csv") {
83        // write to file
84        // TODO could be inlined into the above code to instantly write
85        let fh = File::create(outfile).unwrap();
86        results_to_writer(fh, result, samplenames);
87    } else if outfile.ends_with(".csv.gz") {
88        // write compressed
89        let fh = File::create(outfile).unwrap();
90        let e = GzEncoder::new(fh, Compression::default());
91        results_to_writer(e, result, samplenames);
92    } else {
93        panic!("unknwon file extension. must be either .csv or .csv.gz")
94    }
95}
96
97
98#[cfg(test)]
99mod test {
100    use bustools_core::io::{setup_busfile, BusRecord};
101    use super::*;
102    
103    #[test]
104    fn test_detect_overlap() {
105        let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0};
106        let r2 = BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 3, FLAG: 0};
107        let (fname1, _d1) = setup_busfile(&vec![r1]);
108        let (fname2, _d2) = setup_busfile(&vec![r2]);
109        let busfolders = HashMap::from([
110            ("s1".to_string(), fname1.clone()),
111            ("s2".to_string(), fname2.clone()),
112        ]);
113        println!("{}", fname1);
114        detect_cell_overlap(&busfolders, "/tmp/overlap.csv");
115        detect_cell_overlap(&busfolders, "/tmp/overlap.csv.gz");
116    }
117
118    // #[test]
119    pub fn test_detect_cell_overlap() {
120        // let t2g = "/home/michi/bus_testing/transcripts_to_genes.txt";
121        // let b1 = BusFolder::new("/home/michi/bus_testing/bus_output/", t2g);
122        // let b1 = BusFolder::new("/home/michi/bus_testing/bus_output_short/", t2g);
123        // let b2 = BusFolder::new("/home/michi/bus_testing/bus_output_short/", t2g);
124        // let busfolders = HashMap::from([
125        //     ("full".to_string(), b1),
126        //     ("short".to_string(), b2)
127        // ]);
128
129        let busfolders = HashMap::from([
130            (
131                "full".to_string(),
132                "/home/michi/bus_testing/bus_output/output.corrected.sort.bus".to_string(),
133            ),
134            (
135                "short".to_string(),
136                "/home/michi/bus_testing/bus_output_short/output.corrected.sort.bus".to_string(),
137            ),
138        ]);
139
140        detect_cell_overlap(&busfolders, "/tmp/test_detect_cell_overlap.csv")
141    }
142}