bustools_cli/
busmerger.rs

1//! Filtering/Merging busfiles on CB/UMI overlap
2use bustools_core::{
3    io::{BusParams, BusReader, BusWriterPlain}, iterators::CbUmiGroupIterator, merger::MultiIterator
4};
5use std::collections::HashMap;
6
7/// will extract all busrecords that appear in both inputs and write them to the respective outputs
8///
9/// there'll be two output files, each contining the shared reads from the respective input file
10/// ## Parameters:
11/// * busfile1: first input
12/// * busfile2: 2nd input
13/// * outfile1: 1st output: will contain all CB/UMI that also appear in busfile2 (not the records itself (EC,COUNT) can be different from busfile2)
14/// * outfile2: 2st output: will contain all CB/UMI that also appear in busfile1 (not the records itself (EC,COUNT) can be different from busfile2)
15pub fn merge_busfiles_on_overlap(busfile1: &str, busfile2: &str, outfile1: &str, outfile2: &str) {
16    //
17    // let h: HashMap<String, String> = HashMap::from([
18    //     ("f1".to_string(), busfile1.to_string()),
19    //     ("f2".to_string(), busfile2.to_string()),
20    // ]);
21    // let cbumi_merge_iter = CellUmiIteratorMulti::new(&h);
22
23
24    // curently only avaialbale for plain writers
25    // The BusZWriter cannot `write_records` (we need to assert that we correcrlty closed the file)
26    let params = BusParams {cb_len: 16, umi_len: 12};
27    let mut writers: HashMap<String, BusWriterPlain> = HashMap::from([
28        (
29            "f1".to_string(),
30            BusWriterPlain::new(outfile1, params.clone()),
31        ),
32        (
33            "f2".to_string(),
34            BusWriterPlain::new(outfile2, params),
35        ),
36    ]);
37
38
39    let h: HashMap<String, _> = HashMap::from([
40        ("f1".to_string(), BusReader::new(busfile1).groupby_cbumi()),
41        ("f2".to_string(), BusReader::new(busfile2).groupby_cbumi()),
42    ]);
43    let cbumi_merge_iter = MultiIterator::new(h);
44
45    for (_cbumi, record_map) in cbumi_merge_iter {
46        // if the CB/UMI is present in both files, write
47        if record_map.len() == 2 {
48            for (name, records) in record_map {
49                let w1 = writers.get_mut(&name).unwrap();
50                w1.write_records(&records)
51            }
52        }
53    }
54
55    // cbumi_merge_iter.iter().map(|(_cbumi, record_map)| {
56    //     if record_map.len() == 2 {
57    //         let w1 = writers.get_mut(&name).unwrap();
58
59    //     }
60    // })
61
62
63}
64
65#[cfg(test)]
66mod tests {
67    use super::*;
68    use bustools_core::io::{setup_busfile, BusReader, BusRecord};
69
70    fn get_records(fname: &str) -> Vec<BusRecord> {
71        let reader = BusReader::new(fname);
72        let records: Vec<BusRecord> = reader.into_iter().collect();
73        records
74    }
75
76    #[test]
77    fn test_merge() {
78        let r1 = BusRecord { CB: 0, UMI: 21, EC: 0, COUNT: 2, FLAG: 0 };
79        let r2 = BusRecord { CB: 1, UMI: 2, EC: 0, COUNT: 12, FLAG: 0 };
80        let r3 = BusRecord { CB: 1, UMI: 3, EC: 0, COUNT: 2, FLAG: 0 };
81        let r4 = BusRecord { CB: 3, UMI: 0, EC: 0, COUNT: 2, FLAG: 0 };
82        let r5 = BusRecord { CB: 3, UMI: 0, EC: 1, COUNT: 2, FLAG: 0 };
83
84        let v1 = vec![r1.clone(), r2.clone(), r3.clone(), r4.clone(), r5.clone()];
85
86        let s2 = BusRecord { CB: 1, UMI: 2, EC: 1, COUNT: 12, FLAG: 0 };
87        let s3 = BusRecord { CB: 2, UMI: 3, EC: 1, COUNT: 2, FLAG: 0 };
88        let s4 = BusRecord { CB: 3, UMI: 0, EC: 1, COUNT: 2, FLAG: 0 };
89
90        let v2 = vec![s2.clone(), s3.clone(), s4.clone()];
91
92        // let input1 = "/tmp/merge1.bus";
93        // let input2 = "/tmp/merge2.bus";
94
95        let (input1, _dir1) = setup_busfile(&v1); //input1
96        let (input2, _dir2) = setup_busfile(&v2); // input2
97
98        let output1_path = _dir1.path().join("merge1_out.bus");
99        let output1 = output1_path.to_str().unwrap();
100        let output2_path = _dir2.path().join("merge2_out.bus");
101        let output2 = output2_path.to_str().unwrap();
102
103        merge_busfiles_on_overlap(&input1, &input2, output1, output2);
104
105        assert_eq!(get_records(output1), vec![r2, r4, r5]);
106        assert_eq!(get_records(output2), vec![s2, s4]);
107    }
108}