barcode_count/info.rs
1use ahash::{AHashSet, HashMap, HashMapExt};
2use anyhow::{anyhow, Context, Result};
3use itertools::Itertools;
4use num_format::{Locale, ToFormattedString};
5use regex::Regex;
6use std::{
7 fmt, fs,
8 sync::{
9 atomic::{AtomicU32, Ordering},
10 Arc,
11 },
12};
13
/// Struct to keep track of sequencing errors and correct matches. This is displayed at the end
/// of the algorithm for QC measures.
///
/// Every counter sits behind an `Arc<AtomicU32>`, so each clone of this struct adds to the same
/// totals, and the counters can be incremented through a shared reference.
#[derive(Debug, Clone)]
pub struct SequenceErrors {
    constant_region: Arc<AtomicU32>, // errors within the constant region
    sample_barcode: Arc<AtomicU32>,  // errors within the sample barcode
    barcode: Arc<AtomicU32>,         // errors within the counted barcode
    matched: Arc<AtomicU32>,         // total matched
    duplicates: Arc<AtomicU32>,      // total random barcode duplicates
    low_quality: Arc<AtomicU32>,     // total low quality barcodes
}

impl Default for SequenceErrors {
    fn default() -> Self {
        Self::new()
    }
}

impl SequenceErrors {
    /// Create a new sequence error struct. Starts with 0 errors in all regions, then is added to later.
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// ```
    pub fn new() -> Self {
        // Each field gets its own zeroed, reference-counted counter
        let zeroed = || Arc::new(AtomicU32::new(0));
        SequenceErrors {
            constant_region: zeroed(),
            sample_barcode: zeroed(),
            barcode: zeroed(),
            matched: zeroed(),
            duplicates: zeroed(),
            low_quality: zeroed(),
        }
    }

    /// Add one to constant region error
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// sequence_errors.constant_region_error();
    /// ```
    pub fn constant_region_error(&self) {
        self.constant_region.fetch_add(1, Ordering::Relaxed);
    }

    /// Add one to sample barcode error
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// sequence_errors.sample_barcode_error();
    /// ```
    pub fn sample_barcode_error(&self) {
        self.sample_barcode.fetch_add(1, Ordering::Relaxed);
    }

    /// Add one to barcode error
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// sequence_errors.barcode_error();
    /// ```
    pub fn barcode_error(&self) {
        self.barcode.fetch_add(1, Ordering::Relaxed);
    }

    /// Add one to correct match
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// sequence_errors.correct_match();
    /// ```
    pub fn correct_match(&self) {
        self.matched.fetch_add(1, Ordering::Relaxed);
    }

    /// Add one to duplicates
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// sequence_errors.duplicated();
    /// ```
    pub fn duplicated(&self) {
        self.duplicates.fetch_add(1, Ordering::Relaxed);
    }

    /// Add one to low_quality
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// sequence_errors.low_quality_barcode();
    /// ```
    pub fn low_quality_barcode(&self) {
        self.low_quality.fetch_add(1, Ordering::Relaxed);
    }

    /// Returns a handle whose counters are the same atomics as this one, so increments made
    /// through either handle are visible through both. Equivalent to `clone()`.
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::SequenceErrors;
    ///
    /// let sequence_errors = SequenceErrors::new();
    /// let shared = sequence_errors.arc_clone();
    /// shared.correct_match();
    /// ```
    pub fn arc_clone(&self) -> SequenceErrors {
        // The derived `Clone` clones each `Arc`, which shares rather than copies the counter
        self.clone()
    }
}
140
141impl fmt::Display for SequenceErrors {
142 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
143 write!(
144 f,
145 "\
146 Correctly matched sequences: {}\n\
147 Constant region mismatches: {}\n\
148 Sample barcode mismatches: {}\n\
149 Counted barcode mismatches: {}\n\
150 Duplicates: {}\n\
151 Low quality barcodes: {}",
152 self.matched
153 .load(Ordering::Relaxed)
154 .to_formatted_string(&Locale::en),
155 self.constant_region
156 .load(Ordering::Relaxed)
157 .to_formatted_string(&Locale::en),
158 self.sample_barcode
159 .load(Ordering::Relaxed)
160 .to_formatted_string(&Locale::en),
161 self.barcode
162 .load(Ordering::Relaxed)
163 .to_formatted_string(&Locale::en),
164 self.duplicates
165 .load(Ordering::Relaxed)
166 .to_formatted_string(&Locale::en),
167 self.low_quality
168 .load(Ordering::Relaxed)
169 .to_formatted_string(&Locale::en)
170 )
171 }
172}
173
174// Struct to keep the format information for the sequencing, ie barcodes, regex search etc.
175#[derive(Debug, Clone)]
176pub struct SequenceFormat {
177 pub format_string: String, // sequence with 'N's replacing barcodes
178 pub regions_string: String, // String with each region contain a code
179 pub length: usize, // Total length of format sequence
180 pub constant_region_length: u16, // Length of only the consant nucleotides
181 pub format_regex: Regex, // The regex search used to find barcodes
182 pub barcode_num: usize, // Number of counted barcodes. More for DEL
183 pub barcode_lengths: Vec<u16>, // The length of each counted barcode
184 pub sample_length_option: Option<u16>, // Sample barcode length
185 pub random_barcode: bool, // Whether a random barcode is included
186 pub sample_barcode: bool, // Whether a sammple barcode is included
187}
188
189impl SequenceFormat {
190 /// Creates a new empty SequenceFormat struct
191 ///
192 /// # Example
193 /// ```
194 /// use barcode_count::info::SequenceFormat;
195 ///
196 /// let sequence_format = SequenceFormat::new();
197 /// ```
198 pub fn new() -> Result<Self> {
199 let empty_regex = Regex::new("")?;
200 Ok(SequenceFormat {
201 format_string: String::new(),
202 regions_string: String::new(),
203 length: 0,
204 constant_region_length: 0,
205 format_regex: empty_regex,
206 barcode_num: 0,
207 barcode_lengths: Vec::new(),
208 sample_length_option: None,
209 random_barcode: false,
210 sample_barcode: false,
211 })
212 }
213 /// Parses the format file into all fields of the SequenceFormat struct, including the regex
214 /// search, barcode sizes, and sequence format strings.
215 pub fn parse_format_file(format_path: &str) -> Result<Self> {
216 let mut sequence_format = SequenceFormat::new()?;
217 // Read sequence format file to string
218 let format_data = fs::read_to_string(format_path)
219 .context(format!("Failed to open {}", format_path))?
220 .lines() // split into lines
221 .filter(|line| !line.starts_with('#')) // remove any line that starts with '#'
222 .collect::<String>(); // collect into a String
223
224 // Starts the string that is used to create the regex search
225 let mut regex_string = String::new();
226 // Digit search to find the number within any format group
227 let digit_search = Regex::new(r"\d+")?;
228 // Search groups separated by '|' or statements in order to iterate through each group
229 // within the format data from the format file and create the regex search string, along
230 // with add the other needed information. Uses the {#}, [#], (#), [ATGC], and 'N's as
231 // groups
232 let barcode_search = Regex::new(r"(?i)(\{\d+\})|(\[\d+\])|(\(\d+\))|N+|[ATGC]+")?;
233 for group in barcode_search.find_iter(&format_data) {
234 let group_str = group.as_str();
235 // Holds the capture group name. Is non-barcode regions
236 let mut group_name_option = None;
237
238 // If the group is a barcode group, add the capture group name, and set barcode
239 // included fields to true
240 if group_str.contains('[') {
241 group_name_option = Some("sample".to_string());
242 sequence_format.sample_barcode = true;
243 } else if group_str.contains('{') {
244 sequence_format.barcode_num += 1;
245 group_name_option = Some(format!("barcode{}", sequence_format.barcode_num));
246 } else if group_str.contains('(') {
247 group_name_option = Some("random".to_string());
248 sequence_format.random_barcode = true;
249 }
250
251 if let Some(group_name) = group_name_option {
252 let digits = digit_search
253 .captures(group_str)
254 .unwrap()
255 .get(0)
256 .unwrap()
257 .as_str()
258 .parse::<u16>()
259 .unwrap();
260
261 // Create the capture group with the group name for the barcode and add it to the
262 // string created for the regex search
263 let mut capture_group = format!("(?P<{}>.", group_name);
264 capture_group.push('{');
265 capture_group.push_str(&digits.to_string());
266 capture_group.push_str("})");
267 regex_string.push_str(&capture_group);
268
269 // Add lengths of any of the barcodes to the sequence_format struct fields. Also
270 // set the code for the regions_string
271 let mut push_char = '\0';
272 if group_name == "sample" {
273 sequence_format.sample_length_option = Some(digits);
274 push_char = 'S'
275 } else if group_name.contains("barcode") {
276 sequence_format.barcode_lengths.push(digits);
277 push_char = 'B'
278 } else if group_name == "random" {
279 push_char = 'R'
280 }
281 // For the number of nucleotides of the barcode add 'N's to format string and the
282 // push_char just set to regions_string
283 for _ in 0..digits {
284 sequence_format.regions_string.push(push_char);
285 sequence_format.format_string.push('N')
286 }
287 } else if group_str.contains('N') {
288 // Used to handle if 'N's are added to the format file. These will be treated as
289 // 'any' nucleotide for error handling and matching
290 let num_of_ns = group_str.matches('N').count();
291 let mut n_group = "[AGCT]{".to_string();
292 n_group.push_str(&num_of_ns.to_string());
293 n_group.push('}');
294 regex_string.push_str(&n_group);
295 sequence_format.format_string.push_str(group_str);
296 } else {
297 // Any A,G,C, or T is treated as constant region here
298 regex_string.push_str(&group_str.to_uppercase());
299 sequence_format.format_string.push_str(group_str);
300 let constant_group_length = group_str.chars().count();
301 for _ in 0..constant_group_length {
302 sequence_format.regions_string.push('C');
303 }
304 sequence_format.constant_region_length += constant_group_length as u16;
305 }
306 }
307 sequence_format.length = sequence_format.format_string.chars().count();
308 sequence_format.format_regex = Regex::new(®ex_string)?;
309 Ok(sequence_format)
310 }
311}
312
313impl fmt::Display for SequenceFormat {
314 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
315 let mut key = String::new();
316 let mut new_char = AHashSet::new();
317 for key_char in self.regions_string.chars() {
318 if new_char.insert(key_char) {
319 let key_info = match key_char {
320 'S' => "\nS: Sample barcode",
321 'B' => "\nB: Counted barcode",
322 'C' => "\nC: Constant region",
323 'R' => "\nR: Random barcode",
324 _ => "",
325 };
326 key.push_str(key_info);
327 }
328 }
329 write!(
330 f,
331 "-FORMAT-\n{}\n{}{}",
332 self.format_string, self.regions_string, key
333 )
334 }
335}
336
337/// Contains all possible barcode sequences for error handling and barcode to ID conversion
338pub struct BarcodeConversions {
339 pub samples_barcode_hash: HashMap<String, String>,
340 pub sample_seqs: AHashSet<String>,
341 pub counted_barcodes_hash: Vec<HashMap<String, String>>,
342 pub counted_barcode_seqs: Vec<AHashSet<String>>,
343}
344
345impl Default for BarcodeConversions {
346 fn default() -> Self {
347 Self::new()
348 }
349}
350
351impl BarcodeConversions {
352 /// Creates an empty BarcodeConversions struct
353 pub fn new() -> Self {
354 BarcodeConversions {
355 samples_barcode_hash: HashMap::new(),
356 sample_seqs: AHashSet::new(),
357 counted_barcodes_hash: Vec::new(),
358 counted_barcode_seqs: Vec::new(),
359 }
360 }
361
362 /// Reads in comma separated barcode file (CSV). The columns need to have headers. The first column needs to be the nucleotide barcode
363 /// and the second needs to be the ID
364 pub fn sample_barcode_file_conversion(&mut self, barcode_path: &str) -> Result<()> {
365 // read in the sample barcode file
366 for (barcode, sample_id) in fs::read_to_string(barcode_path)
367 .context(format!("Failed to open {}", barcode_path))?
368 .lines() // split the lines
369 .skip(1) // skip the first line which should be the header
370 .map(|line| {
371 line.split(',')
372 .take(2) // take only the first two values, or columns
373 .map(|value| value.to_string())
374 .collect_tuple()
375 .unwrap_or(("".to_string(), "".to_string()))
376 })
377 {
378 self.samples_barcode_hash.insert(barcode, sample_id);
379 }
380 Ok(())
381 }
382
383 /// Reads in comma separated barcode file (CSV). The columns need to have headers. The first column needs to be the nucleotide barcode
384 /// the second needs to be the ID, and the third needs to be the barcode index location
385 ///
386 /// # Panics
387 ///
388 /// This panics if the third column of the barcode conversion file does not contain integers. Also
389 /// panics if not all integers for barcode numbers is within this columns
390 pub fn barcode_file_conversion(
391 &mut self,
392 barcode_path: &str,
393 barcode_num: usize,
394 ) -> Result<()> {
395 // read in the sample barcode file
396 let barcode_vecs = fs::read_to_string(barcode_path)
397 .context(format!("Failed to read {}", barcode_path))?
398 .lines() // split the lines
399 .skip(1) // skip the first line which should be the header
400 .map(|line| {
401 line.split(',')
402 .take(3) // take only the first three values, or columns
403 .map(|value| value.to_string())
404 .collect_tuple()
405 .unwrap_or(("".to_string(), "".to_string(), "".to_string()))
406 }) // comma split the line into a tuple with the first being the key and the last the value
407 .collect::<Vec<(String, String, String)>>();
408 for _ in 0..barcode_num {
409 self.counted_barcodes_hash.push(HashMap::new());
410 }
411 let mut barcode_num_contained = AHashSet::new();
412 for (barcode, id, barcode_num) in barcode_vecs {
413 let barcode_num_usize = barcode_num.parse::<usize>().context(format!(
414 "Third column of barcode file contains something other than an integer: {}",
415 barcode_num
416 ))? - 1;
417 barcode_num_contained.insert(barcode_num_usize);
418 self.counted_barcodes_hash[barcode_num_usize].insert(barcode, id);
419 }
420 let mut missing_barcode_num = Vec::new();
421 for x in 0..barcode_num {
422 if !barcode_num_contained.contains(&x) {
423 missing_barcode_num.push(x)
424 }
425 }
426 if !missing_barcode_num.is_empty() {
427 return Err(anyhow!(format!(
428 "Barcode conversion file missing barcode numers {:?} in the third column",
429 missing_barcode_num
430 )));
431 }
432 Ok(())
433 }
434 /// Creates a hashmap of all sample barcode sequences in order to compare for sequencing errors
435 pub fn get_sample_seqs(&mut self) {
436 if !self.samples_barcode_hash.is_empty() {
437 for sample_barcode in self.samples_barcode_hash.keys() {
438 self.sample_seqs.insert(sample_barcode.to_string());
439 }
440 }
441 }
442
443 /// Creates a hashmap of all counted barcode sequences in order to compare for sequencing errors
444 pub fn get_barcode_seqs(&mut self) {
445 if !self.counted_barcodes_hash.is_empty() {
446 self.counted_barcode_seqs = self
447 .counted_barcodes_hash
448 .iter()
449 .map(|hash| {
450 hash.keys()
451 .map(|key| key.to_string())
452 .collect::<AHashSet<String>>()
453 }) // creates a hashset for each sequential barcode, then collects into a vector with the index being each sequential counted barcode
454 .collect::<Vec<AHashSet<String>>>();
455 }
456 }
457}
458
/// Struct of how many sequencing errors are allowed
#[derive(Debug, Clone, PartialEq)]
pub struct MaxSeqErrors {
    // errors within the constant region
    constant_region: u16,
    constant_region_size: u16,
    // errors within the sample barcode
    sample_barcode: u16,
    sample_size: u16,
    // errors within the counted barcode
    barcode: Vec<u16>,
    barcode_sizes: Vec<u16>,
    min_quality: f32,
}

impl MaxSeqErrors {
    /// Create a new sequence error struct
    ///
    /// Any `*_errors_option` that is `None` defaults to a fifth (20%, rounded down) of the
    /// matching size. Without a sample barcode size, both the sample size and the allowed sample
    /// errors are 0.
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::MaxSeqErrors;
    ///
    /// let sample_errors_option = None;
    /// let sample_barcode_size_option = Some(10);
    /// let barcode_errors_option = None;
    /// let barcode_sizes = vec![8,8,8];
    /// let constant_errors_option = None;
    /// let constant_region_size = 30;
    /// let min_quality = 0.0;
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// ```
    pub fn new(
        sample_errors_option: Option<u16>,
        sample_barcode_size_option: Option<u16>,
        barcode_errors_option: Option<u16>,
        barcode_sizes: Vec<u16>,
        constant_errors_option: Option<u16>,
        constant_region_size: u16,
        min_quality: f32,
    ) -> Self {
        // If a sample barcode is included use the supplied maximum or 20% of its size, otherwise
        // both the size and the maximum are 0
        let (sample_size, max_sample_errors) = match sample_barcode_size_option {
            Some(size) => (size, sample_errors_option.unwrap_or(size / 5)),
            None => (0, 0),
        };

        // If max error was set by input arguments, use that value for every barcode, otherwise
        // calculate 20% of each barcode size
        let max_barcode_errors: Vec<u16> = barcode_sizes
            .iter()
            .map(|&barcode_size| barcode_errors_option.unwrap_or(barcode_size / 5))
            .collect();

        // If max error was set by input arguments, use that value, otherwise calculate 20% of
        // the constant region size
        let max_constant_errors = constant_errors_option.unwrap_or(constant_region_size / 5);

        MaxSeqErrors {
            constant_region: max_constant_errors,
            constant_region_size,
            sample_barcode: max_sample_errors,
            sample_size,
            barcode: max_barcode_errors,
            barcode_sizes,
            min_quality,
        }
    }

    /// Returns the maximum allowed constant region errors
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::MaxSeqErrors;
    ///
    /// let sample_errors_option = None;
    /// let sample_barcode_size_option = Some(10);
    /// let barcode_errors_option = None;
    /// let barcode_sizes = vec![8,8,8];
    /// let constant_errors_option = None;
    /// let constant_region_size = 30;
    /// let min_quality = 0.0;
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// assert_eq!(max_sequence_errors.max_constant_errors(), 6);
    /// let barcode_sizes = vec![8,8,8];
    /// let constant_errors_option = Some(3);
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// assert_eq!(max_sequence_errors.max_constant_errors(), 3);
    /// ```
    pub fn max_constant_errors(&self) -> u16 {
        self.constant_region
    }

    /// Returns the maximum allowed sample barcode errors
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::MaxSeqErrors;
    ///
    /// let sample_errors_option = None;
    /// let sample_barcode_size_option = Some(10);
    /// let barcode_errors_option = None;
    /// let barcode_sizes = vec![8,8,8];
    /// let constant_errors_option = None;
    /// let constant_region_size = 30;
    /// let min_quality = 0.0;
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// assert_eq!(max_sequence_errors.max_sample_errors(), 2);
    /// let barcode_sizes = vec![8,8,8];
    /// let sample_errors_option = Some(3);
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// assert_eq!(max_sequence_errors.max_sample_errors(), 3);
    /// ```
    pub fn max_sample_errors(&self) -> u16 {
        self.sample_barcode
    }

    /// Returns the maximum allowed errors within each counted barcode
    ///
    /// # Example
    /// ```
    /// use barcode_count::info::MaxSeqErrors;
    ///
    /// let sample_errors_option = None;
    /// let sample_barcode_size_option = Some(10);
    /// let barcode_errors_option = None;
    /// let barcode_sizes = vec![8,8,8];
    /// let constant_errors_option = None;
    /// let constant_region_size = 30;
    /// let min_quality = 0.0;
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// assert_eq!(max_sequence_errors.max_barcode_errors(), vec![1,1,1]);
    /// let barcode_sizes = vec![8,8,8];
    /// let barcode_errors_option = Some(2);
    /// let max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality);
    /// assert_eq!(max_sequence_errors.max_barcode_errors(), vec![2,2,2]);
    /// ```
    pub fn max_barcode_errors(&self) -> &[u16] {
        &self.barcode
    }
}

impl fmt::Display for MaxSeqErrors {
    /// Writes the sizes and allowed mismatches of the constant region, sample barcode and counted
    /// barcodes, followed by the minimum quality score.
    ///
    /// A single counted barcode is printed as a plain number; any other count, including none,
    /// is printed as a list.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Matching on the slices avoids unwrapping `first()`, which panicked when the format
        // held no counted barcodes
        let (barcode_size_info, barcode_error_info) =
            match (self.barcode_sizes.as_slice(), self.barcode.as_slice()) {
                ([barcode_size], [barcode_errors]) => (
                    format!("Barcode size: {}", barcode_size),
                    format!(
                        "Maximum mismatches allowed per barcode sequence: {}",
                        barcode_errors
                    ),
                ),
                (barcode_sizes, barcode_errors) => (
                    format!("Barcode sizes: {:?}", barcode_sizes),
                    format!(
                        "Maximum mismatches allowed per barcode sequence: {:?}",
                        barcode_errors
                    ),
                ),
            };
        write!(
            f,
            "\
            -BARCODE INFO-\n\
            Constant region size: {}\n\
            Maximum mismatches allowed per sequence: {}\n\
            --------------------------------------------------------------\n\
            Sample barcode size: {}\n\
            Maximum mismatches allowed per sequence: {}\n\
            --------------------------------------------------------------\n\
            {}\n\
            {}\n\
            --------------------------------------------------------------\n\
            Minimum allowed average read quality score per barcode: {}\n\
            ",
            self.constant_region_size,
            self.constant_region,
            self.sample_size,
            self.sample_barcode,
            barcode_size_info,
            barcode_error_info,
            self.min_quality
        )
    }
}
660
/// Storage for the counts. Which variant is used depends on whether a random barcode is part of
/// the scheme.
#[derive(Debug)]
pub enum ResultsHashmap {
    // sample key -> counted barcode string -> set of random barcodes seen
    RandomBarcode(HashMap<String, HashMap<String, AHashSet<String>>>),
    // sample key -> counted barcode string -> count
    NoRandomBarcode(HashMap<String, HashMap<String, usize>>),
}

// A struct which holds the count results, whether that is for a scheme which contains a random barcode or not
#[derive(Debug)]
pub struct Results {
    pub results_hashmap: ResultsHashmap, // holds the counted results
    empty_count_hash: HashMap<String, usize>, // An empty hashmap that is used a few times and therefore stored within the struct
    empty_random_hash: HashMap<String, AHashSet<String>>,
    sample_conversion_omited: bool,
}

impl Results {
    /// Create a new Results struct
    ///
    /// The sample keys are registered up front: every key of `samples_barcode_hash` when it is
    /// not empty, or the single key "barcode" when the format has no sample barcode. When the
    /// format has a sample barcode but no conversion was supplied, no key is registered and the
    /// sample barcodes become keys as they are counted.
    pub fn new(
        samples_barcode_hash: &HashMap<String, String>,
        random_barcode: bool,
        sample_barcode: bool,
    ) -> Self {
        // create empty hashmaps to insert and have the sample name included. This is so sample name doesn't need to be searched each time
        let empty_random_hash: HashMap<String, AHashSet<String>> = HashMap::new();
        let empty_count_hash: HashMap<String, usize> = HashMap::new();

        // A sample barcode in the format without a conversion file means the sample DNA barcodes
        // are made into keys later on
        let sample_conversion_omited = samples_barcode_hash.is_empty() && sample_barcode;

        let initial_keys: Vec<String> = if !samples_barcode_hash.is_empty() {
            samples_barcode_hash.keys().cloned().collect()
        } else if !sample_barcode {
            // If there is not a sample barcode within the format, add 'barcode' as key
            vec!["barcode".to_string()]
        } else {
            Vec::new()
        };

        // Create the hashmap within the enum depending on whether or not a random barcode is
        // included
        let results_hashmap = if random_barcode {
            let mut random_hashmap = HashMap::new();
            for sample_key in initial_keys {
                random_hashmap.insert(sample_key, empty_random_hash.clone());
            }
            ResultsHashmap::RandomBarcode(random_hashmap)
        } else {
            let mut count_hashmap = HashMap::new();
            for sample_key in initial_keys {
                count_hashmap.insert(sample_key, empty_count_hash.clone());
            }
            ResultsHashmap::NoRandomBarcode(count_hashmap)
        };

        Results {
            results_hashmap,
            empty_count_hash,
            empty_random_hash,
            sample_conversion_omited,
        }
    }

    /// Adds the count to results hashmap
    ///
    /// An empty `sample_barcode` is counted under the "barcode" key. A sample that has not been
    /// registered yet is registered on the spot, so a count is never discarded.
    ///
    /// Returns `false` only when a random barcode scheme is used and this random barcode was
    /// already recorded for the sample and counted barcodes; otherwise returns `true`.
    pub fn add_count(
        &mut self,
        sample_barcode: &str,
        random_barcode: Option<&String>,
        barcode_string: String,
    ) -> bool {
        // Without a sample barcode the counts live under the "barcode" key created in `new`
        let sample_key = if sample_barcode.is_empty() {
            "barcode"
        } else {
            sample_barcode
        };

        // If conversion file does not exist, samples are not known up front, so each one is
        // added as a key the first time it is seen
        if self.sample_conversion_omited {
            match self.results_hashmap {
                ResultsHashmap::NoRandomBarcode(ref mut count_hashmap) => {
                    if !count_hashmap.contains_key(sample_key) {
                        count_hashmap.insert(sample_key.to_string(), self.empty_count_hash.clone());
                    }
                }
                ResultsHashmap::RandomBarcode(ref mut random_hashmap) => {
                    if !random_hashmap.contains_key(sample_key) {
                        random_hashmap
                            .insert(sample_key.to_string(), self.empty_random_hash.clone());
                    }
                }
            }
        }

        match self.results_hashmap {
            // If random barcode is not included, add the count to this hashmap
            ResultsHashmap::NoRandomBarcode(ref mut count_hashmap) => {
                match count_hashmap.get_mut(sample_key) {
                    Some(barcode_counts) => {
                        *barcode_counts.entry(barcode_string).or_insert(0) += 1;
                    }
                    None => {
                        // An unregistered sample starts its own map here; previously the count
                        // went into a temporary map that was thrown away
                        let mut barcode_counts = self.empty_count_hash.clone();
                        barcode_counts.insert(barcode_string, 1);
                        count_hashmap.insert(sample_key.to_string(), barcode_counts);
                    }
                }
                true
            }
            // If a random barcode is included, add the random barcode and later use the number of
            // random barcodes as the count
            ResultsHashmap::RandomBarcode(ref mut random_hashmap) => {
                // A missing random barcode is stored as the empty string
                let random_barcode = random_barcode.cloned().unwrap_or_default();
                match random_hashmap.get_mut(sample_key) {
                    // `insert` is false when this random barcode was already recorded
                    Some(barcodes_hashmap) => barcodes_hashmap
                        .entry(barcode_string)
                        .or_insert_with(AHashSet::new)
                        .insert(random_barcode),
                    None => {
                        // create the HashMap<barcode_id, Set<RandomBarcodes>> for the new sample
                        let mut random_set = AHashSet::new();
                        random_set.insert(random_barcode);
                        let mut barcodes_hashmap = self.empty_random_hash.clone();
                        barcodes_hashmap.insert(barcode_string, random_set);
                        random_hashmap.insert(sample_key.to_string(), barcodes_hashmap);
                        true
                    }
                }
            }
        }
    }
}
810
/// A struct which holds the enriched single and double counted barcodes. Useful for DEL. This struct is used during output.
pub struct ResultsEnrichment {
    pub single_hashmap: HashMap<String, HashMap<String, usize>>, // enrichment of single barcodes hash used at output
    pub double_hashmap: HashMap<String, HashMap<String, usize>>, // enrichment of double barcodes hash used at output
    empty_count_hash: HashMap<String, usize>,
}

impl ResultsEnrichment {
    /// Creates a ResultsEnrichment struct with no samples and no counts
    pub fn new() -> Self {
        ResultsEnrichment {
            single_hashmap: HashMap::new(),
            double_hashmap: HashMap::new(),
            empty_count_hash: HashMap::new(),
        }
    }

    /// Adds sample barcodes for keys within the hashmaps. This is added later in order to first initiate the struct then add sample barcodes later
    ///
    /// Any counts already stored for a listed sample are replaced by an empty hashmap.
    pub fn add_sample_barcodes(&mut self, samples_barcodes: &[String]) {
        // For each sample barcode, create a sample barcode key to empty hashmap into single and double enrichment hashmaps
        for sample_barcode in samples_barcodes {
            self.single_hashmap
                .insert(sample_barcode.to_string(), self.empty_count_hash.clone());
            self.double_hashmap
                .insert(sample_barcode.to_string(), self.empty_count_hash.clone());
        }
    }

    /// Adds the count to the single barcode enrichment hashmap
    ///
    /// `barcode_string` is comma separated. For each barcode a key is created that keeps only
    /// that barcode in its column, eg `1,2,3` gives `1,,`, `,2,` and `,,3`, and `count` is added
    /// to each key. A `sample_id` that has not been added yet is added on the spot.
    pub fn add_single(&mut self, sample_id: &str, barcode_string: &str, count: usize) {
        let barcodes = barcode_string.split(',').collect::<Vec<&str>>();
        for kept_column in 0..barcodes.len() {
            let single_barcode_string = Self::keep_columns(&barcodes, &[kept_column]);
            Self::add_to(
                &mut self.single_hashmap,
                &self.empty_count_hash,
                sample_id,
                single_barcode_string,
                count,
            );
        }
    }

    /// Adds the count to the double barcode enrichment hashmap
    ///
    /// `barcode_string` is comma separated. For each pair of barcodes a key is created that
    /// keeps only that pair in their columns, eg `1,2,3` gives `1,2,`, `1,,3` and `,2,3`, and
    /// `count` is added to each key. A `sample_id` that has not been added yet is added on the
    /// spot. Nothing is added when there are fewer than two barcodes.
    pub fn add_double(&mut self, sample_id: &str, barcode_string: &str, count: usize) {
        let barcodes = barcode_string.split(',').collect::<Vec<&str>>();
        for first_column in 0..barcodes.len() {
            // The second barcode is always to the right of the first so each pair is made once
            for second_column in (first_column + 1)..barcodes.len() {
                let double_barcode_string =
                    Self::keep_columns(&barcodes, &[first_column, second_column]);
                Self::add_to(
                    &mut self.double_hashmap,
                    &self.empty_count_hash,
                    sample_id,
                    double_barcode_string,
                    count,
                );
            }
        }
    }

    /// Rebuilds the comma separated barcode string with every column emptied except `kept`
    fn keep_columns(barcodes: &[&str], kept: &[usize]) -> String {
        barcodes
            .iter()
            .enumerate()
            .map(|(column, barcode)| if kept.contains(&column) { *barcode } else { "" })
            .collect::<Vec<&str>>()
            .join(",")
    }

    /// Adds `count` to `barcode_key` under `sample_id`, starting the sample's hashmap from
    /// `empty_count_hash` when the sample is not present yet
    fn add_to(
        enrichment_hashmap: &mut HashMap<String, HashMap<String, usize>>,
        empty_count_hash: &HashMap<String, usize>,
        sample_id: &str,
        barcode_key: String,
        count: usize,
    ) {
        match enrichment_hashmap.get_mut(sample_id) {
            Some(barcode_counts) => {
                *barcode_counts.entry(barcode_key).or_insert(0) += count;
            }
            None => {
                // Previously the count for an unknown sample went into a temporary hashmap that
                // was thrown away
                let mut barcode_counts = empty_count_hash.clone();
                barcode_counts.insert(barcode_key, count);
                enrichment_hashmap.insert(sample_id.to_string(), barcode_counts);
            }
        }
    }
}

impl Default for ResultsEnrichment {
    fn default() -> Self {
        Self::new()
    }
}
911}