barcode_count/
info.rs

1use ahash::{AHashSet, HashMap, HashMapExt};
2use anyhow::{anyhow, Context, Result};
3use itertools::Itertools;
4use num_format::{Locale, ToFormattedString};
5use regex::Regex;
6use std::{
7    fmt, fs,
8    sync::{
9        atomic::{AtomicU32, Ordering},
10        Arc,
11    },
12};
13
/// Tally of sequencing errors and correct matches.  This is displayed at the end of the
/// algorithm for QC measures.
///
/// Every counter is an atomic behind an `Arc`, so copies made with
/// [`SequenceErrors::arc_clone`] (or `clone`) all add to the same totals.
#[derive(Debug, Clone)]
pub struct SequenceErrors {
    constant_region: Arc<AtomicU32>, // errors within the constant region
    sample_barcode: Arc<AtomicU32>,  // errors within the sample barcode
    barcode: Arc<AtomicU32>,         // errors within the counted barcode
    matched: Arc<AtomicU32>,         // total matched
    duplicates: Arc<AtomicU32>,      // total random barcode duplicates
    low_quality: Arc<AtomicU32>,     // total low quality barcodes
}

impl Default for SequenceErrors {
    fn default() -> Self {
        Self::new()
    }
}

impl SequenceErrors {
    /// Create a new sequence error struct.  Starts with 0 errors in all regions, then is added to later.
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// ```
    pub fn new() -> Self {
        SequenceErrors {
            constant_region: Arc::new(AtomicU32::new(0)),
            sample_barcode: Arc::new(AtomicU32::new(0)),
            barcode: Arc::new(AtomicU32::new(0)),
            matched: Arc::new(AtomicU32::new(0)),
            duplicates: Arc::new(AtomicU32::new(0)),
            low_quality: Arc::new(AtomicU32::new(0)),
        }
    }

    /// Add one to constant region error
    ///
    /// Only a shared reference is needed because the counter is atomic.
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// sequence_errors.constant_region_error();
    /// ```
    pub fn constant_region_error(&self) {
        self.constant_region.fetch_add(1, Ordering::Relaxed);
    }

    /// Add one to sample barcode error
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// sequence_errors.sample_barcode_error();
    /// ```
    pub fn sample_barcode_error(&self) {
        self.sample_barcode.fetch_add(1, Ordering::Relaxed);
    }

    /// Add one to barcode error
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// sequence_errors.barcode_error();
    /// ```
    pub fn barcode_error(&self) {
        self.barcode.fetch_add(1, Ordering::Relaxed);
    }

    /// Add one to correct match
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// sequence_errors.correct_match();
    /// ```
    pub fn correct_match(&self) {
        self.matched.fetch_add(1, Ordering::Relaxed);
    }

    /// Add one to duplicates
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// sequence_errors.duplicated();
    /// ```
    pub fn duplicated(&self) {
        self.duplicates.fetch_add(1, Ordering::Relaxed);
    }

    /// Add one to low_quality
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// sequence_errors.low_quality_barcode();
    /// ```
    pub fn low_quality_barcode(&self) {
        self.low_quality.fetch_add(1, Ordering::Relaxed);
    }

    /// Returns a handle whose counters are the same allocations as this one's, so increments
    /// made through either handle are visible through both.
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// let shared_errors = sequence_errors.arc_clone();
    /// shared_errors.correct_match();
    /// ```
    pub fn arc_clone(&self) -> SequenceErrors {
        SequenceErrors {
            constant_region: Arc::clone(&self.constant_region),
            sample_barcode: Arc::clone(&self.sample_barcode),
            barcode: Arc::clone(&self.barcode),
            matched: Arc::clone(&self.matched),
            duplicates: Arc::clone(&self.duplicates),
            low_quality: Arc::clone(&self.low_quality),
        }
    }
}
140
141impl fmt::Display for SequenceErrors {
142    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
143        write!(
144            f,
145            "\
146            Correctly matched sequences: {}\n\
147            Constant region mismatches:  {}\n\
148            Sample barcode mismatches:   {}\n\
149            Counted barcode mismatches:  {}\n\
150            Duplicates:                  {}\n\
151            Low quality barcodes:        {}",
152            self.matched
153                .load(Ordering::Relaxed)
154                .to_formatted_string(&Locale::en),
155            self.constant_region
156                .load(Ordering::Relaxed)
157                .to_formatted_string(&Locale::en),
158            self.sample_barcode
159                .load(Ordering::Relaxed)
160                .to_formatted_string(&Locale::en),
161            self.barcode
162                .load(Ordering::Relaxed)
163                .to_formatted_string(&Locale::en),
164            self.duplicates
165                .load(Ordering::Relaxed)
166                .to_formatted_string(&Locale::en),
167            self.low_quality
168                .load(Ordering::Relaxed)
169                .to_formatted_string(&Locale::en)
170        )
171    }
172}
173
/// Format information for the sequencing reads, i.e. the barcode layout, the regex search etc.
///
/// Filled in by `SequenceFormat::parse_format_file`; `SequenceFormat::new` creates an empty one.
#[derive(Debug, Clone)]
pub struct SequenceFormat {
    pub format_string: String,       // sequence with 'N's replacing barcodes
    pub regions_string: String, // One code per barcode/constant nucleotide: 'S', 'B', 'R' or 'C'
    pub length: usize,               // Total length of format sequence
    pub constant_region_length: u16, // Length of only the constant nucleotides
    pub format_regex: Regex,         // The regex search used to find barcodes
    pub barcode_num: usize,          // Number of counted barcodes.  More for DEL
    pub barcode_lengths: Vec<u16>,   // The length of each counted barcode
    pub sample_length_option: Option<u16>, // Sample barcode length, None without a sample barcode
    pub random_barcode: bool,        // Whether a random barcode is included
    pub sample_barcode: bool,        // Whether a sample barcode is included
}
188
189impl SequenceFormat {
190    /// Creates a new empty SequenceFormat struct
191    ///
192    /// # Example
193    /// ```
194    /// use barcode_count::info::SequenceFormat;
195    ///
196    /// let sequence_format = SequenceFormat::new();
197    /// ```
198    pub fn new() -> Result<Self> {
199        let empty_regex = Regex::new("")?;
200        Ok(SequenceFormat {
201            format_string: String::new(),
202            regions_string: String::new(),
203            length: 0,
204            constant_region_length: 0,
205            format_regex: empty_regex,
206            barcode_num: 0,
207            barcode_lengths: Vec::new(),
208            sample_length_option: None,
209            random_barcode: false,
210            sample_barcode: false,
211        })
212    }
213    /// Parses the format file into all fields of the SequenceFormat struct, including the regex
214    /// search, barcode sizes, and sequence format strings.
215    pub fn parse_format_file(format_path: &str) -> Result<Self> {
216        let mut sequence_format = SequenceFormat::new()?;
217        // Read sequence format file to string
218        let format_data = fs::read_to_string(format_path)
219            .context(format!("Failed to open {}", format_path))?
220            .lines() // split into lines
221            .filter(|line| !line.starts_with('#')) // remove any line that starts with '#'
222            .collect::<String>(); // collect into a String
223
224        // Starts the string that is used to create the regex search
225        let mut regex_string = String::new();
226        // Digit search to find the number within any format group
227        let digit_search = Regex::new(r"\d+")?;
228        // Search groups separated by '|' or statements in order to iterate through each group
229        // within the format data from the format file and create the regex search string, along
230        // with add the other needed information.  Uses the {#}, [#], (#), [ATGC], and 'N's as
231        // groups
232        let barcode_search = Regex::new(r"(?i)(\{\d+\})|(\[\d+\])|(\(\d+\))|N+|[ATGC]+")?;
233        for group in barcode_search.find_iter(&format_data) {
234            let group_str = group.as_str();
235            // Holds the capture group name.  Is non-barcode regions
236            let mut group_name_option = None;
237
238            // If the group is a barcode group, add the capture group name, and set barcode
239            // included fields to true
240            if group_str.contains('[') {
241                group_name_option = Some("sample".to_string());
242                sequence_format.sample_barcode = true;
243            } else if group_str.contains('{') {
244                sequence_format.barcode_num += 1;
245                group_name_option = Some(format!("barcode{}", sequence_format.barcode_num));
246            } else if group_str.contains('(') {
247                group_name_option = Some("random".to_string());
248                sequence_format.random_barcode = true;
249            }
250
251            if let Some(group_name) = group_name_option {
252                let digits = digit_search
253                    .captures(group_str)
254                    .unwrap()
255                    .get(0)
256                    .unwrap()
257                    .as_str()
258                    .parse::<u16>()
259                    .unwrap();
260
261                // Create the capture group with the group name for the barcode and add it to the
262                // string created for the regex search
263                let mut capture_group = format!("(?P<{}>.", group_name);
264                capture_group.push('{');
265                capture_group.push_str(&digits.to_string());
266                capture_group.push_str("})");
267                regex_string.push_str(&capture_group);
268
269                // Add lengths of any of the barcodes to the sequence_format struct fields.  Also
270                // set the code for the regions_string
271                let mut push_char = '\0';
272                if group_name == "sample" {
273                    sequence_format.sample_length_option = Some(digits);
274                    push_char = 'S'
275                } else if group_name.contains("barcode") {
276                    sequence_format.barcode_lengths.push(digits);
277                    push_char = 'B'
278                } else if group_name == "random" {
279                    push_char = 'R'
280                }
281                // For the number of nucleotides of the barcode add 'N's to format string and the
282                // push_char just set to regions_string
283                for _ in 0..digits {
284                    sequence_format.regions_string.push(push_char);
285                    sequence_format.format_string.push('N')
286                }
287            } else if group_str.contains('N') {
288                // Used to handle if 'N's are added to the format file.  These will be treated as
289                // 'any' nucleotide for error handling and matching
290                let num_of_ns = group_str.matches('N').count();
291                let mut n_group = "[AGCT]{".to_string();
292                n_group.push_str(&num_of_ns.to_string());
293                n_group.push('}');
294                regex_string.push_str(&n_group);
295                sequence_format.format_string.push_str(group_str);
296            } else {
297                // Any A,G,C, or T is treated as constant region here
298                regex_string.push_str(&group_str.to_uppercase());
299                sequence_format.format_string.push_str(group_str);
300                let constant_group_length = group_str.chars().count();
301                for _ in 0..constant_group_length {
302                    sequence_format.regions_string.push('C');
303                }
304                sequence_format.constant_region_length += constant_group_length as u16;
305            }
306        }
307        sequence_format.length = sequence_format.format_string.chars().count();
308        sequence_format.format_regex = Regex::new(&regex_string)?;
309        Ok(sequence_format)
310    }
311}
312
313impl fmt::Display for SequenceFormat {
314    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
315        let mut key = String::new();
316        let mut new_char = AHashSet::new();
317        for key_char in self.regions_string.chars() {
318            if new_char.insert(key_char) {
319                let key_info = match key_char {
320                    'S' => "\nS: Sample barcode",
321                    'B' => "\nB: Counted barcode",
322                    'C' => "\nC: Constant region",
323                    'R' => "\nR: Random barcode",
324                    _ => "",
325                };
326                key.push_str(key_info);
327            }
328        }
329        write!(
330            f,
331            "-FORMAT-\n{}\n{}{}",
332            self.format_string, self.regions_string, key
333        )
334    }
335}
336
/// Contains all possible barcode sequences for error handling and barcode to ID conversion
pub struct BarcodeConversions {
    /// Sample barcode sequence (first CSV column) mapped to its sample ID (second column)
    pub samples_barcode_hash: HashMap<String, String>,
    /// The sample barcode sequences alone, filled in by `get_sample_seqs`
    pub sample_seqs: AHashSet<String>,
    /// One map per counted barcode position (index = barcode number - 1), each mapping a
    /// barcode sequence to its ID
    pub counted_barcodes_hash: Vec<HashMap<String, String>>,
    /// The barcode sequences alone for each counted barcode position, filled in by
    /// `get_barcode_seqs`
    pub counted_barcode_seqs: Vec<AHashSet<String>>,
}
344
345impl Default for BarcodeConversions {
346    fn default() -> Self {
347        Self::new()
348    }
349}
350
351impl BarcodeConversions {
352    /// Creates an empty BarcodeConversions struct
353    pub fn new() -> Self {
354        BarcodeConversions {
355            samples_barcode_hash: HashMap::new(),
356            sample_seqs: AHashSet::new(),
357            counted_barcodes_hash: Vec::new(),
358            counted_barcode_seqs: Vec::new(),
359        }
360    }
361
362    /// Reads in comma separated barcode file (CSV).  The columns need to have headers.  The first column needs to be the nucleotide barcode
363    /// and the second needs to be the ID
364    pub fn sample_barcode_file_conversion(&mut self, barcode_path: &str) -> Result<()> {
365        // read in the sample barcode file
366        for (barcode, sample_id) in fs::read_to_string(barcode_path)
367            .context(format!("Failed to open {}", barcode_path))?
368            .lines() // split the lines
369            .skip(1) // skip the first line which should be the header
370            .map(|line| {
371                line.split(',')
372                    .take(2) // take only the first two values, or columns
373                    .map(|value| value.to_string())
374                    .collect_tuple()
375                    .unwrap_or(("".to_string(), "".to_string()))
376            })
377        {
378            self.samples_barcode_hash.insert(barcode, sample_id);
379        }
380        Ok(())
381    }
382
383    /// Reads in comma separated barcode file (CSV).  The columns need to have headers.  The first column needs to be the nucleotide barcode
384    /// the second needs to be the ID, and the third needs to be the barcode index location
385    ///
386    /// # Panics
387    ///
388    /// This panics if the third column of the barcode conversion file does not contain integers.  Also
389    /// panics if not all integers for barcode numbers is within this columns
390    pub fn barcode_file_conversion(
391        &mut self,
392        barcode_path: &str,
393        barcode_num: usize,
394    ) -> Result<()> {
395        // read in the sample barcode file
396        let barcode_vecs = fs::read_to_string(barcode_path)
397            .context(format!("Failed to read {}", barcode_path))?
398            .lines() // split the lines
399            .skip(1) // skip the first line which should be the header
400            .map(|line| {
401                line.split(',')
402                    .take(3) // take only the first three values, or columns
403                    .map(|value| value.to_string())
404                    .collect_tuple()
405                    .unwrap_or(("".to_string(), "".to_string(), "".to_string()))
406            }) // comma split the line into a tuple with the first being the key and the last the value
407            .collect::<Vec<(String, String, String)>>();
408        for _ in 0..barcode_num {
409            self.counted_barcodes_hash.push(HashMap::new());
410        }
411        let mut barcode_num_contained = AHashSet::new();
412        for (barcode, id, barcode_num) in barcode_vecs {
413            let barcode_num_usize = barcode_num.parse::<usize>().context(format!(
414                "Third column of barcode file contains something other than an integer: {}",
415                barcode_num
416            ))? - 1;
417            barcode_num_contained.insert(barcode_num_usize);
418            self.counted_barcodes_hash[barcode_num_usize].insert(barcode, id);
419        }
420        let mut missing_barcode_num = Vec::new();
421        for x in 0..barcode_num {
422            if !barcode_num_contained.contains(&x) {
423                missing_barcode_num.push(x)
424            }
425        }
426        if !missing_barcode_num.is_empty() {
427            return Err(anyhow!(format!(
428                "Barcode conversion file missing barcode numers {:?} in the third column",
429                missing_barcode_num
430            )));
431        }
432        Ok(())
433    }
434    /// Creates a hashmap of all sample barcode sequences in order to compare for sequencing errors
435    pub fn get_sample_seqs(&mut self) {
436        if !self.samples_barcode_hash.is_empty() {
437            for sample_barcode in self.samples_barcode_hash.keys() {
438                self.sample_seqs.insert(sample_barcode.to_string());
439            }
440        }
441    }
442
443    /// Creates a hashmap of all counted barcode sequences in order to compare for sequencing errors
444    pub fn get_barcode_seqs(&mut self) {
445        if !self.counted_barcodes_hash.is_empty() {
446            self.counted_barcode_seqs = self
447                .counted_barcodes_hash
448                .iter()
449                .map(|hash| {
450                    hash.keys()
451                        .map(|key| key.to_string())
452                        .collect::<AHashSet<String>>()
453                }) // creates a hashset for each sequential barcode, then collects into a vector with the index being each sequential counted barcode
454                .collect::<Vec<AHashSet<String>>>();
455        }
456    }
457}
458
/// Struct of how many sequencing errors are allowed
#[derive(Debug, Clone, PartialEq)]
pub struct MaxSeqErrors {
    // errors within the constant region
    constant_region: u16,
    constant_region_size: u16,
    // errors within the sample barcode
    sample_barcode: u16,
    sample_size: u16,
    // errors within each counted barcode, in the same order as barcode_sizes
    barcode: Vec<u16>,
    barcode_sizes: Vec<u16>,
    // minimum allowed average read quality score per barcode
    min_quality: f32,
}

impl MaxSeqErrors {
    /// Create a new sequence error struct
    ///
    /// For every region an explicit maximum passed in is used as is; otherwise the maximum is
    /// 20% of the region size (integer division by 5).  Without a sample barcode size, both the
    /// sample size and the allowed sample errors are 0.
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::MaxSeqErrors;
    ///
    /// let sample_errors_option = None;
    /// let sample_barcode_size_option = Some(10);
    /// let barcode_errors_option = None;
    /// let barcode_sizes = vec![8,8,8];
    /// let constant_errors_option = None;
    /// let constant_region_size = 30;
    /// let min_quality = 0.0;
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// ```
    pub fn new(
        sample_errors_option: Option<u16>,
        sample_barcode_size_option: Option<u16>,
        barcode_errors_option: Option<u16>,
        barcode_sizes: Vec<u16>,
        constant_errors_option: Option<u16>,
        constant_region_size: u16,
        min_quality: f32,
    ) -> Self {
        // Sample errors only apply when the format actually has a sample barcode
        let (sample_size, max_sample_errors) = match sample_barcode_size_option {
            Some(size) => (size, sample_errors_option.unwrap_or(size / 5)),
            None => (0, 0),
        };

        // One maximum per counted barcode: the argument if given, otherwise 20% of its size
        let max_barcode_errors: Vec<u16> = barcode_sizes
            .iter()
            .map(|&barcode_size| barcode_errors_option.unwrap_or(barcode_size / 5))
            .collect();

        // The argument if given, otherwise 20% of the constant region size
        let max_constant_errors = constant_errors_option.unwrap_or(constant_region_size / 5);

        MaxSeqErrors {
            constant_region: max_constant_errors,
            constant_region_size,
            sample_barcode: max_sample_errors,
            sample_size,
            barcode: max_barcode_errors,
            barcode_sizes,
            min_quality,
        }
    }

    /// Returns the maximum allowed constant region errors
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::MaxSeqErrors;
    ///
    /// let sample_errors_option = None;
    /// let sample_barcode_size_option = Some(10);
    /// let barcode_errors_option = None;
    /// let barcode_sizes = vec![8,8,8];
    /// let constant_errors_option = None;
    /// let constant_region_size = 30;
    /// let min_quality = 0.0;
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// assert_eq!(max_sequence_errors.max_constant_errors(), 6);
    /// let barcode_sizes = vec![8,8,8];
    /// let constant_errors_option = Some(3);
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// assert_eq!(max_sequence_errors.max_constant_errors(), 3);
    /// ```
    pub fn max_constant_errors(&self) -> u16 {
        self.constant_region
    }

    /// Returns the maximum allowed sample barcode errors
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::MaxSeqErrors;
    ///
    /// let sample_errors_option = None;
    /// let sample_barcode_size_option = Some(10);
    /// let barcode_errors_option = None;
    /// let barcode_sizes = vec![8,8,8];
    /// let constant_errors_option = None;
    /// let constant_region_size = 30;
    /// let min_quality = 0.0;
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// assert_eq!(max_sequence_errors.max_sample_errors(), 2);
    /// let barcode_sizes = vec![8,8,8];
    /// let sample_errors_option = Some(3);
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// assert_eq!(max_sequence_errors.max_sample_errors(), 3);
    /// ```
    pub fn max_sample_errors(&self) -> u16 {
        self.sample_barcode
    }

    /// Returns the maximum allowed errors within each counted barcode
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::MaxSeqErrors;
    ///
    /// let sample_errors_option = None;
    /// let sample_barcode_size_option = Some(10);
    /// let barcode_errors_option = None;
    /// let barcode_sizes = vec![8,8,8];
    /// let constant_errors_option = None;
    /// let constant_region_size = 30;
    /// let min_quality = 0.0;
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// assert_eq!(max_sequence_errors.max_barcode_errors(), vec![1,1,1]);
    /// let barcode_sizes = vec![8,8,8];
    /// let barcode_errors_option = Some(2);
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// assert_eq!(max_sequence_errors.max_barcode_errors(), vec![2,2,2]);
    /// ```
    pub fn max_barcode_errors(&self) -> &[u16] {
        &self.barcode
    }
}

/// Prints the region sizes and the mismatches allowed for each, followed by the minimum quality.
///
/// A single counted barcode is printed as a plain number; any other amount, including none at
/// all, is printed as a list.
impl fmt::Display for MaxSeqErrors {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Only the exactly-one case gets the singular wording, so an empty barcode list falls
        // through to the list form instead of unwrapping a missing first element
        let (barcode_size_info, barcode_error_info) =
            match (self.barcode_sizes.as_slice(), self.barcode.as_slice()) {
                ([barcode_size], [barcode_errors]) => (
                    format!("Barcode size: {}", barcode_size),
                    format!(
                        "Maximum mismatches allowed per barcode sequence: {}",
                        barcode_errors
                    ),
                ),
                _ => (
                    format!("Barcode sizes: {:?}", self.barcode_sizes),
                    format!(
                        "Maximum mismatches allowed per barcode sequence: {:?}",
                        self.barcode
                    ),
                ),
            };
        write!(
            f,
            "\
            -BARCODE INFO-\n\
            Constant region size: {}\n\
            Maximum mismatches allowed per sequence: {}\n\
            --------------------------------------------------------------\n\
            Sample barcode size: {}\n\
            Maximum mismatches allowed per sequence: {}\n\
            --------------------------------------------------------------\n\
            {}\n\
            {}\n\
            --------------------------------------------------------------\n\
            Minimum allowed average read quality score per barcode: {}\n\
            ",
            self.constant_region_size,
            self.constant_region,
            self.sample_size,
            self.sample_barcode,
            barcode_size_info,
            barcode_error_info,
            self.min_quality
        )
    }
}
660
/// The counted results, stored differently depending on whether the format has a random barcode.
///
/// In both variants the outer key is the sample barcode, or "barcode" when the format has no
/// sample barcode.
#[derive(Debug)]
pub enum ResultsHashmap {
    /// Used when a random barcode is included; the number of distinct random barcodes in each
    /// set is later used as the count.
    /// NOTE(review): the inner key is presumably the counted barcode string -- confirm in `add_count`.
    RandomBarcode(HashMap<String, HashMap<String, AHashSet<String>>>),
    /// Used without a random barcode: counted barcode string mapped to the number of times it
    /// was added
    NoRandomBarcode(HashMap<String, HashMap<String, usize>>),
}
666
/// A struct which holds the count results, whether that is for a scheme which contains a random barcode or not
#[derive(Debug)]
pub struct Results {
    pub results_hashmap: ResultsHashmap, // holds the counted results
    empty_count_hash: HashMap<String, usize>, // An empty hashmap that is cloned a few times and therefore stored within the struct
    empty_random_hash: HashMap<String, AHashSet<String>>, // Same, for the random barcode variant
    sample_conversion_omited: bool, // true when the format has a sample barcode but no sample conversion was supplied
}
675
676impl Results {
677    /// Create a new Results struct
678    pub fn new(
679        samples_barcode_hash: &HashMap<String, String>,
680        random_barcode: bool,
681        sample_barcode: bool,
682    ) -> Self {
683        let mut results_hashmap;
684        // Create an empty hashmap into the enum depending on whether or not a random barcode is
685        // included
686        if random_barcode {
687            results_hashmap = ResultsHashmap::RandomBarcode(HashMap::new());
688        } else {
689            results_hashmap = ResultsHashmap::NoRandomBarcode(HashMap::new());
690        }
691
692        // If sample name conversion was included, add all sample names to the hashmaps used to count
693        let mut sample_conversion_omited = false;
694        // create empty hashmaps to insert and have the sample name included.  This is so sample name doesn't need to be searched each time
695        let empty_random_hash: HashMap<String, AHashSet<String>> = HashMap::new();
696        let empty_count_hash: HashMap<String, usize> = HashMap::new();
697        // If there is a sample barcode file included, add these as keys in the relevant count hashmap
698        if !samples_barcode_hash.is_empty() {
699            for sample in samples_barcode_hash.keys() {
700                let sample_barcode = sample.to_string();
701                match results_hashmap {
702                    ResultsHashmap::RandomBarcode(ref mut random_hashmap) => {
703                        random_hashmap.insert(sample_barcode.clone(), empty_random_hash.clone());
704                    }
705                    ResultsHashmap::NoRandomBarcode(ref mut count_hashmap) => {
706                        count_hashmap.insert(sample_barcode, empty_count_hash.clone());
707                    }
708                }
709            }
710        } else if !sample_barcode {
711            // If there is not a sample barcode within the format, add 'barcode' as key
712            match results_hashmap {
713                ResultsHashmap::RandomBarcode(ref mut random_hashmap) => {
714                    random_hashmap.insert("barcode".to_string(), empty_random_hash.clone());
715                }
716                ResultsHashmap::NoRandomBarcode(ref mut count_hashmap) => {
717                    count_hashmap.insert("barcode".to_string(), empty_count_hash.clone());
718                }
719            }
720        } else {
721            // If there is a sample barcode in the format but no sample barcode conversion file,
722            // set the following to true to make sample DNA barcodes into keys later on
723            sample_conversion_omited = true;
724        }
725        // return the Results struct
726        Results {
727            results_hashmap,
728            empty_count_hash,
729            empty_random_hash,
730            sample_conversion_omited,
731        }
732    }
733
734    /// Adds the count to results hashmap
735    pub fn add_count(
736        &mut self,
737        sample_barcode: &str,
738        random_barcode: Option<&String>,
739        barcode_string: String,
740    ) -> bool {
741        // If conversion file does not exist, add the barcode as a key value
742        if self.sample_conversion_omited {
743            match self.results_hashmap {
744                ResultsHashmap::NoRandomBarcode(ref mut count_hashmap) => {
745                    if !count_hashmap.contains_key(sample_barcode) {
746                        count_hashmap
747                            .insert(sample_barcode.to_string(), self.empty_count_hash.clone());
748                    };
749                }
750                ResultsHashmap::RandomBarcode(ref mut random_hashmap) => {
751                    if !random_hashmap.contains_key(sample_barcode) {
752                        random_hashmap
753                            .insert(sample_barcode.to_string(), self.empty_random_hash.clone());
754                    };
755                }
756            }
757        };
758
759        match self.results_hashmap {
760            // If random barcode is not included, add the count to this hashmap
761            ResultsHashmap::NoRandomBarcode(ref mut count_hashmap) => {
762                *count_hashmap
763                    .get_mut(sample_barcode)
764                    .unwrap_or(&mut self.empty_count_hash.clone())
765                    .entry(barcode_string)
766                    .or_insert(0) += 1;
767            }
768            // If a random barcode is included, add the random barcode and later use the number of
769            // random barcodes as the count
770            ResultsHashmap::RandomBarcode(ref mut random_hashmap) => {
771                // Get the hashmap for the sample
772                let barcodes_hashmap_option = if sample_barcode.is_empty() {
773                    random_hashmap.get_mut("barcode")
774                } else {
775                    random_hashmap.get_mut(sample_barcode)
776                };
777                if let Some(barcodes_hashmap) = barcodes_hashmap_option {
778                    // If the barcodes_hashmap is not empty
779                    // but doesn't contain the barcode
780                    if let std::collections::hash_map::Entry::Vacant(e) = barcodes_hashmap.entry(barcode_string.clone()) {
781                        // insert the hashmap<barcode_id, Set<random_barcodes>>
782                        let mut intermediate_set = AHashSet::new();
783                        intermediate_set
784                            .insert(random_barcode.unwrap_or(&"".to_string()).to_string());
785                        e.insert(intermediate_set);
786                    } else {
787                        // if the hashmap<sample_id, hashmap<barcode_id, Set<>> exists, check to see if the random barcode already was inserted
788                        let random_set = barcodes_hashmap.get_mut(&barcode_string).unwrap();
789                        return random_set
790                            .insert(random_barcode.unwrap_or(&"".to_string()).to_string());
791                    }
792                } else {
793                    // create the Set<RandomBarcode>
794                    let mut intermediate_set = AHashSet::new();
795                    intermediate_set.insert(random_barcode.unwrap_or(&"".to_string()).to_string());
796                    let mut intermediate_hash = HashMap::new();
797                    // create the HashMap<barcode_id, Set<RandomBarcodes>>
798                    intermediate_hash.insert(barcode_string.to_string(), intermediate_set);
799                    // insert this into the random_hashmap connected to the sample_ID
800                    random_hashmap.insert(sample_barcode.to_string(), intermediate_hash);
801                }
802            }
803        }
804
805        // Return that a count was added.  An earlier return value is used for when a random
806        // barcode is already within the results
807        true
808    }
809}
810
/// A struct which holds the enriched single and double counted barcodes.  Useful for DEL.  This struct is used during output.
pub struct ResultsEnrichment {
    pub single_hashmap: HashMap<String, HashMap<String, usize>>, // enrichment of single barcodes hash used at output
    pub double_hashmap: HashMap<String, HashMap<String, usize>>, // enrichment of double barcodes hash used at output
    empty_count_hash: HashMap<String, usize>, // empty template which is cloned for every new sample key
}

impl ResultsEnrichment {
    /// Creates a new struct with empty single and double enrichment hashmaps.
    pub fn new() -> Self {
        ResultsEnrichment {
            single_hashmap: HashMap::new(),
            double_hashmap: HashMap::new(),
            empty_count_hash: HashMap::new(),
        }
    }

    /// Adds sample barcodes for keys within the hashmaps.  This is added later in order to first initiate the struct then add sample barcodes later
    ///
    /// A sample barcode which is already a key is reset to an empty hashmap.
    pub fn add_sample_barcodes(&mut self, samples_barcodes: &[String]) {
        // For each sample barcode, create a sample barcode key to empty hashmap into single and double enrichment hashmaps
        for sample_barcode in samples_barcodes {
            self.single_hashmap
                .insert(sample_barcode.to_string(), self.empty_count_hash.clone());
            self.double_hashmap
                .insert(sample_barcode.to_string(), self.empty_count_hash.clone());
        }
    }

    /// Adds `count` to `barcode_key` of `sample_id` within `enrichment_hashmap`, creating the sample entry when it is missing
    fn add_to_sample(
        enrichment_hashmap: &mut HashMap<String, HashMap<String, usize>>,
        empty_count_hash: &HashMap<String, usize>,
        sample_id: &str,
        barcode_key: String,
        count: usize,
    ) {
        if let Some(barcode_counts) = enrichment_hashmap.get_mut(sample_id) {
            // Insert 0 if the barcodes are not within the sample -> barcodes hashmap, then add the count regardless
            *barcode_counts.entry(barcode_key).or_insert(0) += count;
        } else {
            // The sample was not added with add_sample_barcodes.  Create it here so the count is kept
            let mut barcode_counts = empty_count_hash.clone();
            barcode_counts.insert(barcode_key, count);
            enrichment_hashmap.insert(sample_id.to_string(), barcode_counts);
        }
    }

    /// Adds the count to the single barcode enrichment hashmap
    ///
    /// For every barcode within the comma separated `barcode_string`, `count` is added to a key which
    /// keeps only that barcode in its column and leaves the other columns empty.  Eg 1,2,3 is
    /// added to 1,, and ,2, and ,,3
    pub fn add_single(&mut self, sample_id: &str, barcode_string: &str, count: usize) {
        // split the barcodes into a vec from their comma separated form
        let barcode_split = barcode_string.split(',').collect::<Vec<&str>>();
        for (index, single_barcode) in barcode_split.iter().copied().enumerate() {
            // Start with every column empty and fill in only the current barcode.  This puts it in the right column
            let mut columns = vec![""; barcode_split.len()];
            columns[index] = single_barcode;
            Self::add_to_sample(
                &mut self.single_hashmap,
                &self.empty_count_hash,
                sample_id,
                columns.join(","),
                count,
            );
        }
    }

    /// Adds the count to the double barcode enrichment hashmap
    ///
    /// For every pair of barcodes within the comma separated `barcode_string`, `count` is added to a
    /// key which keeps only that pair in their columns.  Eg 1,2,3 is added to 1,2, and 1,,3 and
    /// ,2,3.  Nothing is added when there is only one barcode.
    pub fn add_double(&mut self, sample_id: &str, barcode_string: &str, count: usize) {
        // split the barcodes into a vec from their comma separated form
        let barcode_split = barcode_string.split(',').collect::<Vec<&str>>();
        let barcode_num = barcode_split.len();
        // Take each barcode as the first of the pair
        for first_barcode_index in 0..barcode_num {
            // Pair it with every barcode which comes after it.  This accounts for the second being the next barcode or two away etc.
            for second_barcode_index in (first_barcode_index + 1)..barcode_num {
                // Start with every column empty and fill in only the two barcodes of the pair
                let mut columns = vec![""; barcode_num];
                columns[first_barcode_index] = barcode_split[first_barcode_index];
                columns[second_barcode_index] = barcode_split[second_barcode_index];
                Self::add_to_sample(
                    &mut self.double_hashmap,
                    &self.empty_count_hash,
                    sample_id,
                    columns.join(","),
                    count,
                );
            }
        }
    }
}

impl Default for ResultsEnrichment {
    fn default() -> Self {
        Self::new()
    }
}