barcode_count/
parse.rs

1use anyhow::{anyhow, Result};
2use regex::Captures;
3use std::{
4    collections::VecDeque,
5    fmt,
6    sync::{
7        atomic::{AtomicBool, Ordering},
8        Arc, Mutex,
9    },
10};
11
12use crate::info::{MaxSeqErrors, Results, SequenceErrors, SequenceFormat};
13use ahash::AHashSet;
14
15pub struct SequenceParser {
16    shared_mut_clone: SharedMutData,
17    sequence_errors_clone: SequenceErrors,
18    sequence_format_clone: SequenceFormat,
19    max_errors_clone: MaxSeqErrors,
20    sample_seqs: AHashSet<String>,
21    counted_barcode_seqs: Vec<AHashSet<String>>,
22    raw_sequence: RawSequenceRead,
23    barcode_groups: Vec<String>,
24    min_quality_score: f32,
25}
26
27impl SequenceParser {
28    pub fn new(
29        shared_mut_clone: SharedMutData,
30        sequence_errors_clone: SequenceErrors,
31        sequence_format_clone: SequenceFormat,
32        max_errors_clone: MaxSeqErrors,
33        sample_seqs: AHashSet<String>,
34        counted_barcode_seqs: Vec<AHashSet<String>>,
35        min_quality_score: f32,
36    ) -> Self {
37        let mut barcode_groups = Vec::new();
38        for x in 0..sequence_format_clone.barcode_num {
39            barcode_groups.push(format!("barcode{}", x + 1))
40        }
41        SequenceParser {
42            shared_mut_clone,
43            sequence_errors_clone,
44            sequence_format_clone,
45            max_errors_clone,
46            sample_seqs,
47            counted_barcode_seqs,
48            raw_sequence: RawSequenceRead::new(),
49            barcode_groups,
50            min_quality_score,
51        }
52    }
53    pub fn parse(&mut self) -> Result<()> {
54        // Loop until there are no sequences left to parse.  These are fed into seq vec by the reader thread
55        loop {
56            if self.get_seqeunce()? {
57                if let Some(seq_match_result) = self.match_seq()? {
58                    let barcode_string = seq_match_result.barcode_string();
59                    // If there is a random barcode included
60                    let added = self.shared_mut_clone.results.lock().unwrap().add_count(
61                        &seq_match_result.sample_barcode,
62                        seq_match_result.random_barcode.as_ref(),
63                        barcode_string,
64                    );
65                    if added {
66                        self.sequence_errors_clone.correct_match()
67                    } else {
68                        self.sequence_errors_clone.duplicated();
69                    }
70                }
71            } else if self.shared_mut_clone.finished.load(Ordering::Relaxed) {
72                break;
73            }
74        }
75        Ok(())
76    }
77
78    fn get_seqeunce(&mut self) -> Result<bool> {
79        // Pop off the last sequence from the seq vec
80        if let Some(new_raw_sequence) = self.shared_mut_clone.seq.lock().unwrap().pop_back() {
81            self.raw_sequence = RawSequenceRead::unpack(new_raw_sequence)?;
82            Ok(true)
83        } else {
84            Ok(false)
85        }
86    }
87
88    /// Does a regex search and captures the barcodes.  Returns a struct of the results.  
89    fn match_seq(&mut self) -> Result<Option<SequenceMatchResult>> {
90        self.check_and_fix_consant_region();
91        // if the barcodes are found continue, else return None and record a constant region error
92        if let Some(barcodes) = self
93            .sequence_format_clone
94            .format_regex
95            .captures(&self.raw_sequence.sequence)
96        {
97            // If there was a minimum set for quality, check each barcode's quality
98            if self.min_quality_score > 0.0 {
99                if let Some(format_match) = self
100                    .sequence_format_clone
101                    .format_regex
102                    .find(&self.raw_sequence.sequence)
103                {
104                    let start = format_match.start();
105                    if self.raw_sequence.low_quality(
106                        self.min_quality_score,
107                        &self.sequence_format_clone.regions_string,
108                        start,
109                    ) {
110                        // If any are low qualty, add to the low quality count and return
111                        self.sequence_errors_clone.low_quality_barcode();
112                        return Ok(None);
113                    }
114                } else {
115                    return Err(anyhow!(
116                        "Regex find failed after regex captures was successful"
117                    ));
118                }
119            }
120
121            // Create a match results struct which tests the regex regions
122            let match_results = SequenceMatchResult::new(
123                barcodes,
124                &self.barcode_groups,
125                &self.counted_barcode_seqs,
126                self.max_errors_clone.max_barcode_errors(),
127                &self.sample_seqs,
128                self.max_errors_clone.max_sample_errors(),
129            );
130
131            // If the sample barcode was not found, record the error and return none so that the algorithm stops for this sequence
132            if match_results.sample_barcode_error {
133                self.sequence_errors_clone.sample_barcode_error();
134                return Ok(None);
135            }
136            // If any of the counted barcodes were not found, even with error handling, record the error and return none so that the algorithm stops for this sequence
137            if match_results.counted_barcode_error {
138                self.sequence_errors_clone.barcode_error();
139                return Ok(None);
140            }
141            // If all went well, return the match results struct
142            Ok(Some(match_results))
143        } else {
144            // If the constant region was not found, record the error and return None
145            self.sequence_errors_clone.constant_region_error();
146            Ok(None)
147        }
148    }
149
150    /// Checks the constant region of the sequence then finds the best fix if it is not found.  Basically whether or not the regex search worked
151    fn check_and_fix_consant_region(&mut self) {
152        // If the regex search does not work, try to fix the constant region
153        if !self
154            .sequence_format_clone
155            .format_regex
156            .is_match(&self.raw_sequence.sequence)
157        {
158            self.raw_sequence.fix_constant_region(
159                &self.sequence_format_clone.format_string,
160                self.max_errors_clone.max_constant_errors(),
161            );
162        }
163    }
164}
165
166pub struct SharedMutData {
167    pub seq: Arc<Mutex<VecDeque<String>>>,
168    pub finished: Arc<AtomicBool>,
169    pub results: Arc<Mutex<Results>>,
170}
171
172impl SharedMutData {
173    pub fn new(
174        seq: Arc<Mutex<VecDeque<String>>>,
175        finished: Arc<AtomicBool>,
176        results: Arc<Mutex<Results>>,
177    ) -> Self {
178        SharedMutData {
179            seq,
180            finished,
181            results,
182        }
183    }
184
185    pub fn arc_clone(&self) -> SharedMutData {
186        let seq = Arc::clone(&self.seq);
187        let finished = Arc::clone(&self.finished);
188        let results = Arc::clone(&self.results);
189        SharedMutData {
190            seq,
191            finished,
192            results,
193        }
194    }
195}
196
197/// A struct to hold the raw sequencing information and transform it if there are sequencing errors
198#[derive(Clone)]
199pub struct RawSequenceRead {
200    description: String,     // line 1 of fastq
201    pub sequence: String,    // line 2 of fastq
202    add_description: String, // line 3 of fastq
203    quality_values: String,  // line 4 of fastq
204}
205
206impl Default for RawSequenceRead {
207    fn default() -> Self {
208        Self::new()
209    }
210}
211
212impl RawSequenceRead {
213    pub fn new() -> Self {
214        RawSequenceRead {
215            description: String::new(),
216            sequence: String::new(),
217            add_description: String::new(),
218            quality_values: String::new(),
219        }
220    }
221
222    pub fn new_fill(
223        line_1: String,
224        line_2: String,
225        line_3: String,
226        line_4: String,
227    ) -> RawSequenceRead {
228        RawSequenceRead {
229            description: line_1,
230            sequence: line_2,
231            add_description: line_3,
232            quality_values: line_4,
233        }
234    }
235
236    pub fn add_line(&mut self, line_num: u16, line: String) -> Result<()> {
237        match line_num {
238            1 => self.description = line,
239            2 => self.sequence = line,
240            3 => self.add_description = line,
241            4 => self.quality_values = line,
242            _ => {
243                return Err(anyhow!(
244                "Too many new lines found within fastq read\nCurrent read\n{}\nCurrent line: {}",
245                self,
246                line
247            ))
248            }
249        }
250        Ok(())
251    }
252
253    pub fn pack(&self) -> String {
254        format!(
255            "{}\n{}\n{}\n{}",
256            self.description, self.sequence, self.add_description, self.quality_values
257        )
258    }
259
260    pub fn unpack(raw_string: String) -> Result<Self> {
261        let mut raw_sequence_read = RawSequenceRead::new();
262        for (line_num, line) in raw_string.split('\n').enumerate() {
263            let true_line = line_num as u16 + 1;
264            raw_sequence_read.add_line(true_line, line.to_string())?
265        }
266        Ok(raw_sequence_read)
267    }
268
269    /// Replaces the 'N's in the sequencing format with the barcodes to fix any sequencing errrors that would cause the regex search not to work
270    pub fn insert_barcodes_constant_region(&mut self, format_string: &str, best_sequence: String) {
271        // Start a new string to push to
272        let mut fixed_sequence = String::new();
273        // Push the correct constant region nucleotides.  If the constant string has an N, push the nucleotides from the original
274        // sequence corresponding to the barcodes
275        for (old_char, new_char) in best_sequence.chars().zip(format_string.chars()) {
276            if new_char == 'N' {
277                fixed_sequence.push_str(&old_char.to_string());
278            } else {
279                fixed_sequence.push_str(&new_char.to_string());
280            }
281        }
282        self.sequence = fixed_sequence
283    }
284
285    /// Fixes the constant region by finding the closest match within the full seqeuence that has fewer than the max errors allowed,
286    /// then uses the format string to flip the barcodes into the 'N's and have a fixed constant region string
287    pub fn fix_constant_region(&mut self, format_string: &str, max_constant_errors: u16) {
288        // Find the region of the sequence that best matches the constant region.  This is doen by iterating through the sequence
289        // Get the length difference between what was sequenced and the barcode region with constant regions
290        // This is to stop the iteration in the next step
291        let length_diff = self.sequence.len() - format_string.len();
292
293        // Create a vector of sequences the length of the constant region + barcodes to check for where the best match is located
294        let mut possible_seqs = Vec::new();
295        for index in 0..length_diff {
296            let possible_seq = self
297                .sequence
298                .chars()
299                .skip(index) // skip to where the current index is and take the next amount equal to the length of the constant region + barcodes
300                .take(format_string.len())
301                .collect::<String>();
302            // Add the new sequence to the vector of all possible matches
303            possible_seqs.push(possible_seq);
304        }
305        // Find the closest match within what was sequenced to the constant region
306        let best_sequence_option = fix_error(format_string, &possible_seqs, max_constant_errors);
307
308        if let Some(best_sequence) = best_sequence_option {
309            self.insert_barcodes_constant_region(format_string, best_sequence);
310        } else {
311            self.sequence = "".to_string();
312        }
313    }
314
315    /// Each DNA base read score within FASTQ is the ascii number - 33.
316    /// This returns the number scores associated with the ascii values
317    ///
318    /// Score    Error Probability
319    /// 40       0.0001
320    /// 30       0.001
321    /// 20       0.01
322    /// 10       0.1
323    pub fn quality_scores(&self) -> Vec<u8> {
324        self.quality_values
325            .chars()
326            .map(|ch| ch as u8 - 33)
327            .collect::<Vec<u8>>()
328    }
329
330    /// Test for if any of the barcode average quality score falls below the min_average cutoff
331    pub fn low_quality(
332        &self,
333        min_average: f32,
334        barcode_indicator_string: &str,
335        start: usize,
336    ) -> bool {
337        let mut scores = Vec::new(); // vec to hold the quality scores for each barcode
338        let mut previous_type = '\0'; // setup previoius barcode inidator type for the first comparison
339
340        for (score, seq_type) in self
341            .quality_scores()
342            .iter()
343            .skip(start)
344            .zip(barcode_indicator_string.chars())
345        // Zip score and barcode indicator
346        {
347            // Check for change in barcode/sequence type to know to calculate average score and create a new start to scores
348            if seq_type != previous_type {
349                // If scores is empty.  This avoids if there is a transition from consant region to barcode.  Constant region is not calculated
350                if !scores.is_empty() {
351                    // Get the average quality score for the barcode and if it is below the max average return true for low_quality
352                    let sum: f32 = scores.iter().sum();
353                    let average_score: f32 = sum / scores.len() as f32;
354                    if average_score < min_average {
355                        return true;
356                    }
357                    // Start a new vec for the next barcode
358                    scores = Vec::new();
359                }
360                // set previous indicator type to the current type to check next
361                previous_type = seq_type;
362                // if indicator is not for a constant region, create a new vec with the new score
363                if seq_type != 'C' {
364                    scores = vec![*score as f32];
365                }
366            } else {
367                // If indicator type is not for a constant region, add the score to the vec
368                if seq_type != 'C' {
369                    scores.push(*score as f32);
370                }
371            }
372        }
373        // If no average scores cause a true return, then return low_quality as false
374        false
375    }
376
377    pub fn check_fastq_format(&self) -> Result<()> {
378        // Test to see if the first line is not a sequence and the second is a sequence, which is typical fastq format
379        match test_sequence(&self.description) {
380            LineType::Sequence => {
381                println!("{}", self);
382                return Err(anyhow!("The first line within the FASTQ contains DNA sequences.  Check the FASTQ format"));
383            }
384            LineType::Metadata => (),
385        }
386        match test_sequence(&self.sequence) {
387            LineType::Sequence => (),
388            LineType::Metadata => {
389                println!("{}", self);
390                return Err(anyhow!("The second line within the FASTQ file is not a sequence. Check the FASTQ format"));
391            }
392        }
393        Ok(())
394    }
395}
396
397impl fmt::Display for RawSequenceRead {
398    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
399        write!(
400            f,
401            "Line 1: {}\nLine 2: {}\nLine 3: {}\nLine 4: {}",
402            self.description, self.sequence, self.add_description, self.quality_values
403        )
404    }
405}
406
407/// An enum of linetype to use with test_sequence.  Every line of a FASTQ should either be sequence or metadata for the sequence.
408enum LineType {
409    Sequence,
410    Metadata,
411}
412
413/// Tests whether a line within the file String is a sequence by checking if over half of the line contains DNA neceotide letters
414fn test_sequence(sequence: &str) -> LineType {
415    let sequence_length = sequence.len(); // the the length of the line
416    let adenines = sequence.matches('A').count(); // And the amount of each DNA nucleotide
417    let guanines = sequence.matches('G').count();
418    let cytosines = sequence.matches('C').count();
419    let thymines = sequence.matches('T').count();
420    let any = sequence.matches('N').count();
421    let total_dna = adenines + guanines + cytosines + thymines + any;
422    // Check if less than half of the line contains DNA nucleotides.  If so, return that the line is metadata.  Otherwise, a sequence
423    if total_dna < sequence_length / 2 {
424        return LineType::Metadata;
425    }
426    LineType::Sequence
427}
428
429/// A struct to hold the results of the regex search on the sequence along with perform the functions to fix and find
430pub struct SequenceMatchResult {
431    pub sample_barcode: String,
432    pub counted_barcodes: Vec<String>,
433    pub counted_barcode_error: bool,
434    pub sample_barcode_error: bool,
435    pub random_barcode: Option<String>,
436}
437
438impl SequenceMatchResult {
439    pub fn new(
440        barcodes: Captures, // The regex result on the sequence
441        barcode_groups: &[String],
442        counted_barcode_seqs: &[AHashSet<String>], // The vec of known counted barcode sequences in order to fix sequencing errors.  Will be empty if none are known or included
443        counted_barcode_max_errors: &[u16], // The maximum errors allowed for each counted barcode
444        sample_seqs: &AHashSet<String>, // A hashset of all known sample barcodes. Will be empty if none are known or included
445        sample_seqs_max_errors: u16,    // Maximum allowed sample barcode sequencing errors
446    ) -> SequenceMatchResult {
447        // Check for sample barcode and start with setting error to false
448        let mut sample_barcode_error = false;
449        let sample_barcode;
450        // If 'sample' is within the regex returned search continue with checking and fixing
451        if let Some(sample_barcode_match) = barcodes.name("sample") {
452            let sample_barcode_str = sample_barcode_match.as_str();
453            if sample_seqs.is_empty() {
454                sample_barcode = sample_barcode_str.to_string();
455            } else {
456                // If the sample barcode is known save it
457                if sample_seqs.contains(sample_barcode_str) {
458                    sample_barcode = sample_barcode_str.to_string();
459                } else {
460                    // Otherwise try and fix it.  If the fix returns none, then save the error and an empty string
461                    let sample_barcode_fix_option =
462                        fix_error(sample_barcode_str, sample_seqs, sample_seqs_max_errors);
463                    if let Some(fixed_barcode) = sample_barcode_fix_option {
464                        sample_barcode = fixed_barcode;
465                    } else {
466                        sample_barcode = String::new();
467                        sample_barcode_error = true;
468                    }
469                }
470            }
471        } else {
472            // If there was no sample, save an empty string which should not have any allocation
473            sample_barcode = "barcode".to_string();
474        }
475
476        // Check the counted barcodes and start with setting the error to false
477        let mut counted_barcode_error = false;
478        // Create an empty vec to hold the barcodes
479        let mut counted_barcodes = Vec::new();
480        // Only continue if the sample barcode was found
481        if !sample_barcode_error {
482            // Iterate through the counted barcocdes.  Fix if they are not within the known barcodes
483            for (index, barcode_group) in barcode_groups.iter().enumerate() {
484                let mut counted_barcode =
485                    barcodes.name(barcode_group).unwrap().as_str().to_string();
486                // If a barcode conversion file was included and there are known barcodes, check for sequencing errors
487                if !counted_barcode_seqs.is_empty() {
488                    // If the barcode is not known, try and fix
489                    if !counted_barcode_seqs[index].contains(&counted_barcode) {
490                        let barcode_seq_fix_option = fix_error(
491                            &counted_barcode,
492                            &counted_barcode_seqs[index],
493                            counted_barcode_max_errors[index],
494                        );
495                        if let Some(fixed_barcode) = barcode_seq_fix_option {
496                            counted_barcode = fixed_barcode;
497                        } else {
498                            // If a fix was not found, return the error and stop going through more barcodes
499                            counted_barcode_error = true;
500                            break;
501                        }
502                    }
503                }
504                // If all is well, add the counted barcode to the vec
505                counted_barcodes.push(counted_barcode);
506            }
507        }
508
509        // Chceck for a random barcode
510        let random_barcode;
511        // If a random barcode exists, add it.  Otherwise set it to an empty string
512        if let Some(random_barcode_match) = barcodes.name("random") {
513            random_barcode = Some(random_barcode_match.as_str().to_string())
514        } else {
515            random_barcode = None
516        }
517        SequenceMatchResult {
518            sample_barcode,
519            counted_barcodes,
520            counted_barcode_error,
521            sample_barcode_error,
522            random_barcode,
523        }
524    }
525
526    /// Returns a comma separated counted barcodes string.  Perfect for CSV file writing
527    pub fn barcode_string(&self) -> String {
528        self.counted_barcodes.join(",")
529    }
530}
531
532/// Fix an error in a sequence by comparing it to all possible sequences.  If no sequence matches with fewer or equal to the number of mismatches 'None' is returned.
533/// 'None' is also returned if two or more sequences are best matches.  Will work with vec and hashset
534///
535/// # Example
536///
537/// ```
538/// use barcode_count::parse::fix_error;
539///
540/// let barcode = "AGTAG";
541///
542/// let possible_barcodes_one_match: std::collections::AHashSet<String> = ["AGCAG".to_string(), "ACAAG".to_string(), "AGCAA".to_string()].iter().cloned().collect(); // only the first has a single mismatch
543/// let possible_barcodes_two_match: std::collections::AHashSet<String> = ["AGCAG".to_string(), "AGAAG".to_string(), "AGCAA".to_string()].iter().cloned().collect(); // first and second have a single mismatch
544///
545/// let max_mismatches = barcode.chars().count() as u16 / 5; // allow up to 20% mismatches
546///
547/// let fixed_error_one = fix_error(barcode, &possible_barcodes_one_match, max_mismatches);
548/// let fixed_error_two = fix_error(barcode, &possible_barcodes_two_match, max_mismatches);
549///
550/// assert_eq!(fixed_error_one, Some("AGCAG".to_string()));
551/// assert_eq!(fixed_error_two, None);
552/// ```
553pub fn fix_error<'a, I>(mismatch_seq: &str, possible_seqs: I, mismatches: u16) -> Option<String>
554where
555    I: IntoIterator<Item = &'a String>,
556{
557    let mut best_match = None; // start the best match with None
558    let mut best_mismatch_count = mismatches + 1; // Add 1 and start the best.  This allows a match with the same mismatches as required
559    let mut keep = true; // An initiated variable to check if there is more than one best match
560
561    // Iterate through possible matches
562    for true_seq in possible_seqs {
563        // Initiate the number of mismatches for the current iterated possible sequece match
564        let mut mismatches = 0;
565
566        // Iterate through the nucleotides of the possible match and the sequence to be fixed finding how many mismatches
567        // If the mismatches exceed the current best mismatched, end this early
568        for (possible_char, current_char) in true_seq.chars().zip(mismatch_seq.chars()) {
569            if possible_char != current_char && current_char != 'N' && possible_char != 'N' {
570                mismatches += 1;
571            }
572            if mismatches > best_mismatch_count {
573                break;
574            }
575        }
576        // If there are more than one best match, don't keep
577        if mismatches == best_mismatch_count {
578            keep = false
579        }
580        // If this is the best match, keep and reset best mismatches to this value
581        if mismatches < best_mismatch_count {
582            keep = true;
583            best_mismatch_count = mismatches;
584            best_match = Some(true_seq.to_string());
585        }
586    }
587    // If there is one best match and it is some, return it.  Otherwise return None
588    if keep && best_match.is_some() {
589        best_match
590    } else {
591        None
592    }
593}