Skip to main content

cyto_ibu_barcode_correct/
lib.rs

1mod stats;
2mod whitelist;
3
4pub use stats::{CorrectStats, FormattedStats};
5pub use whitelist::Whitelist;
6
7use std::path::Path;
8
9use anyhow::Result;
10use cyto_cli::ibu::ArgsBarcode;
11use cyto_io::{match_input, match_output, match_output_stderr};
12use ibu::{Reader, Record, Writer};
13use log::trace;
14
15use crate::whitelist::Correction;
16
17fn write_statistics<P: AsRef<Path>>(path: Option<P>, stats: CorrectStats) -> Result<()> {
18    let mut writer = match_output_stderr(path)?;
19    let format_stats = FormattedStats::new(stats);
20    serde_json::to_writer_pretty(&mut writer, &format_stats)?;
21    writer.flush()?;
22    Ok(())
23}
24
25/// Prebuild whitelist so multiple threads deduplicate work in building mismatch table.
26pub fn run_with_prebuilt_whitelist(args: &ArgsBarcode, mut whitelist: Whitelist) -> Result<()> {
27    // Build IO handles
28    let input = match_input(args.input.input.as_ref())?;
29
30    // Initialize the reader and header
31    let reader = Reader::new(input)?;
32    let header = reader.header();
33
34    // Write the header to the output file
35    let output = match_output(args.options.output.as_ref())?;
36    let mut writer = Writer::new(output, header)?;
37
38    // Process the records sequentially
39    let mut stats = CorrectStats::default();
40    let mut second_pass = Vec::new();
41
42    trace!(
43        "Starting first pass [file: {}, exact_match: {}]",
44        args.input.input.as_deref().unwrap_or("stdin"),
45        args.options.exact
46    );
47    for record in reader {
48        let record = record?;
49        let barcode = record.barcode;
50        stats.total += 1;
51
52        // Case where barcode is in the whitelist without error
53        match whitelist.correct_to(barcode, args.options.exact) {
54            Correction::Ambiguous => {
55                if args.options.skip_second_pass {
56                    stats.ambiguous += 1;
57                    if args.options.include {
58                        writer.write_record(&record)?;
59                    }
60                } else {
61                    second_pass.push(record); // Record is ambiguous - will try to resolve in second pass
62                }
63            }
64            Correction::Unchanged => {
65                stats.matched += 1;
66                stats.unchanged += 1;
67                whitelist.increment(barcode);
68                writer.write_record(&record)?;
69            }
70            Correction::Corrected(corrected) => {
71                stats.matched += 1;
72                stats.corrected += 1;
73                whitelist.increment(corrected);
74                let new_record = Record::new(corrected, record.umi, record.index);
75                writer.write_record(&new_record)?;
76            }
77        }
78    }
79
80    if !second_pass.is_empty() && !args.options.exact {
81        trace!(
82            "Starting second pass (ambiguous subset) [file: {}]...",
83            args.input.input.as_deref().unwrap_or("stdin")
84        );
85        for record in second_pass {
86            match whitelist.ambiguously_correct_to_(record.barcode) {
87                Correction::Ambiguous => {
88                    stats.ambiguous += 1;
89                    // Write ambiguous unless user wants to remove
90                    if args.options.include {
91                        writer.write_record(&record)?;
92                    }
93                }
94                Correction::Unchanged => {
95                    stats.matched += 1;
96                    stats.unchanged += 1;
97                    writer.write_record(&record)?;
98                }
99                Correction::Corrected(corrected) => {
100                    stats.matched += 1;
101                    stats.corrected += 1;
102                    stats.corrected_via_counts += 1;
103                    let new_record = Record::new(corrected, record.umi, record.index);
104                    writer.write_record(&new_record)?;
105                }
106            }
107        }
108    }
109
110    // Flush the output
111    writer.finish()?;
112
113    // Write the statistics to stderr
114    write_statistics(args.options.log.as_ref(), stats)?;
115    Ok(())
116}
117
118pub fn run(args: &ArgsBarcode) -> Result<()> {
119    let whitelist = Whitelist::from_path(&args.options.whitelist)?;
120    run_with_prebuilt_whitelist(args, whitelist)
121}