barcode_count/parse.rs
1use anyhow::{anyhow, Result};
2use regex::Captures;
3use std::{
4 collections::VecDeque,
5 fmt,
6 sync::{
7 atomic::{AtomicBool, Ordering},
8 Arc, Mutex,
9 },
10};
11
12use crate::info::{MaxSeqErrors, Results, SequenceErrors, SequenceFormat};
13use ahash::AHashSet;
14
15pub struct SequenceParser {
16 shared_mut_clone: SharedMutData,
17 sequence_errors_clone: SequenceErrors,
18 sequence_format_clone: SequenceFormat,
19 max_errors_clone: MaxSeqErrors,
20 sample_seqs: AHashSet<String>,
21 counted_barcode_seqs: Vec<AHashSet<String>>,
22 raw_sequence: RawSequenceRead,
23 barcode_groups: Vec<String>,
24 min_quality_score: f32,
25}
26
27impl SequenceParser {
28 pub fn new(
29 shared_mut_clone: SharedMutData,
30 sequence_errors_clone: SequenceErrors,
31 sequence_format_clone: SequenceFormat,
32 max_errors_clone: MaxSeqErrors,
33 sample_seqs: AHashSet<String>,
34 counted_barcode_seqs: Vec<AHashSet<String>>,
35 min_quality_score: f32,
36 ) -> Self {
37 let mut barcode_groups = Vec::new();
38 for x in 0..sequence_format_clone.barcode_num {
39 barcode_groups.push(format!("barcode{}", x + 1))
40 }
41 SequenceParser {
42 shared_mut_clone,
43 sequence_errors_clone,
44 sequence_format_clone,
45 max_errors_clone,
46 sample_seqs,
47 counted_barcode_seqs,
48 raw_sequence: RawSequenceRead::new(),
49 barcode_groups,
50 min_quality_score,
51 }
52 }
53 pub fn parse(&mut self) -> Result<()> {
54 // Loop until there are no sequences left to parse. These are fed into seq vec by the reader thread
55 loop {
56 if self.get_seqeunce()? {
57 if let Some(seq_match_result) = self.match_seq()? {
58 let barcode_string = seq_match_result.barcode_string();
59 // If there is a random barcode included
60 let added = self.shared_mut_clone.results.lock().unwrap().add_count(
61 &seq_match_result.sample_barcode,
62 seq_match_result.random_barcode.as_ref(),
63 barcode_string,
64 );
65 if added {
66 self.sequence_errors_clone.correct_match()
67 } else {
68 self.sequence_errors_clone.duplicated();
69 }
70 }
71 } else if self.shared_mut_clone.finished.load(Ordering::Relaxed) {
72 break;
73 }
74 }
75 Ok(())
76 }
77
78 fn get_seqeunce(&mut self) -> Result<bool> {
79 // Pop off the last sequence from the seq vec
80 if let Some(new_raw_sequence) = self.shared_mut_clone.seq.lock().unwrap().pop_back() {
81 self.raw_sequence = RawSequenceRead::unpack(new_raw_sequence)?;
82 Ok(true)
83 } else {
84 Ok(false)
85 }
86 }
87
88 /// Does a regex search and captures the barcodes. Returns a struct of the results.
89 fn match_seq(&mut self) -> Result<Option<SequenceMatchResult>> {
90 self.check_and_fix_consant_region();
91 // if the barcodes are found continue, else return None and record a constant region error
92 if let Some(barcodes) = self
93 .sequence_format_clone
94 .format_regex
95 .captures(&self.raw_sequence.sequence)
96 {
97 // If there was a minimum set for quality, check each barcode's quality
98 if self.min_quality_score > 0.0 {
99 if let Some(format_match) = self
100 .sequence_format_clone
101 .format_regex
102 .find(&self.raw_sequence.sequence)
103 {
104 let start = format_match.start();
105 if self.raw_sequence.low_quality(
106 self.min_quality_score,
107 &self.sequence_format_clone.regions_string,
108 start,
109 ) {
110 // If any are low qualty, add to the low quality count and return
111 self.sequence_errors_clone.low_quality_barcode();
112 return Ok(None);
113 }
114 } else {
115 return Err(anyhow!(
116 "Regex find failed after regex captures was successful"
117 ));
118 }
119 }
120
121 // Create a match results struct which tests the regex regions
122 let match_results = SequenceMatchResult::new(
123 barcodes,
124 &self.barcode_groups,
125 &self.counted_barcode_seqs,
126 self.max_errors_clone.max_barcode_errors(),
127 &self.sample_seqs,
128 self.max_errors_clone.max_sample_errors(),
129 );
130
131 // If the sample barcode was not found, record the error and return none so that the algorithm stops for this sequence
132 if match_results.sample_barcode_error {
133 self.sequence_errors_clone.sample_barcode_error();
134 return Ok(None);
135 }
136 // If any of the counted barcodes were not found, even with error handling, record the error and return none so that the algorithm stops for this sequence
137 if match_results.counted_barcode_error {
138 self.sequence_errors_clone.barcode_error();
139 return Ok(None);
140 }
141 // If all went well, return the match results struct
142 Ok(Some(match_results))
143 } else {
144 // If the constant region was not found, record the error and return None
145 self.sequence_errors_clone.constant_region_error();
146 Ok(None)
147 }
148 }
149
150 /// Checks the constant region of the sequence then finds the best fix if it is not found. Basically whether or not the regex search worked
151 fn check_and_fix_consant_region(&mut self) {
152 // If the regex search does not work, try to fix the constant region
153 if !self
154 .sequence_format_clone
155 .format_regex
156 .is_match(&self.raw_sequence.sequence)
157 {
158 self.raw_sequence.fix_constant_region(
159 &self.sequence_format_clone.format_string,
160 self.max_errors_clone.max_constant_errors(),
161 );
162 }
163 }
164}
165
166pub struct SharedMutData {
167 pub seq: Arc<Mutex<VecDeque<String>>>,
168 pub finished: Arc<AtomicBool>,
169 pub results: Arc<Mutex<Results>>,
170}
171
172impl SharedMutData {
173 pub fn new(
174 seq: Arc<Mutex<VecDeque<String>>>,
175 finished: Arc<AtomicBool>,
176 results: Arc<Mutex<Results>>,
177 ) -> Self {
178 SharedMutData {
179 seq,
180 finished,
181 results,
182 }
183 }
184
185 pub fn arc_clone(&self) -> SharedMutData {
186 let seq = Arc::clone(&self.seq);
187 let finished = Arc::clone(&self.finished);
188 let results = Arc::clone(&self.results);
189 SharedMutData {
190 seq,
191 finished,
192 results,
193 }
194 }
195}
196
197/// A struct to hold the raw sequencing information and transform it if there are sequencing errors
198#[derive(Clone)]
199pub struct RawSequenceRead {
200 description: String, // line 1 of fastq
201 pub sequence: String, // line 2 of fastq
202 add_description: String, // line 3 of fastq
203 quality_values: String, // line 4 of fastq
204}
205
206impl Default for RawSequenceRead {
207 fn default() -> Self {
208 Self::new()
209 }
210}
211
212impl RawSequenceRead {
213 pub fn new() -> Self {
214 RawSequenceRead {
215 description: String::new(),
216 sequence: String::new(),
217 add_description: String::new(),
218 quality_values: String::new(),
219 }
220 }
221
222 pub fn new_fill(
223 line_1: String,
224 line_2: String,
225 line_3: String,
226 line_4: String,
227 ) -> RawSequenceRead {
228 RawSequenceRead {
229 description: line_1,
230 sequence: line_2,
231 add_description: line_3,
232 quality_values: line_4,
233 }
234 }
235
236 pub fn add_line(&mut self, line_num: u16, line: String) -> Result<()> {
237 match line_num {
238 1 => self.description = line,
239 2 => self.sequence = line,
240 3 => self.add_description = line,
241 4 => self.quality_values = line,
242 _ => {
243 return Err(anyhow!(
244 "Too many new lines found within fastq read\nCurrent read\n{}\nCurrent line: {}",
245 self,
246 line
247 ))
248 }
249 }
250 Ok(())
251 }
252
253 pub fn pack(&self) -> String {
254 format!(
255 "{}\n{}\n{}\n{}",
256 self.description, self.sequence, self.add_description, self.quality_values
257 )
258 }
259
260 pub fn unpack(raw_string: String) -> Result<Self> {
261 let mut raw_sequence_read = RawSequenceRead::new();
262 for (line_num, line) in raw_string.split('\n').enumerate() {
263 let true_line = line_num as u16 + 1;
264 raw_sequence_read.add_line(true_line, line.to_string())?
265 }
266 Ok(raw_sequence_read)
267 }
268
269 /// Replaces the 'N's in the sequencing format with the barcodes to fix any sequencing errrors that would cause the regex search not to work
270 pub fn insert_barcodes_constant_region(&mut self, format_string: &str, best_sequence: String) {
271 // Start a new string to push to
272 let mut fixed_sequence = String::new();
273 // Push the correct constant region nucleotides. If the constant string has an N, push the nucleotides from the original
274 // sequence corresponding to the barcodes
275 for (old_char, new_char) in best_sequence.chars().zip(format_string.chars()) {
276 if new_char == 'N' {
277 fixed_sequence.push_str(&old_char.to_string());
278 } else {
279 fixed_sequence.push_str(&new_char.to_string());
280 }
281 }
282 self.sequence = fixed_sequence
283 }
284
285 /// Fixes the constant region by finding the closest match within the full seqeuence that has fewer than the max errors allowed,
286 /// then uses the format string to flip the barcodes into the 'N's and have a fixed constant region string
287 pub fn fix_constant_region(&mut self, format_string: &str, max_constant_errors: u16) {
288 // Find the region of the sequence that best matches the constant region. This is doen by iterating through the sequence
289 // Get the length difference between what was sequenced and the barcode region with constant regions
290 // This is to stop the iteration in the next step
291 let length_diff = self.sequence.len() - format_string.len();
292
293 // Create a vector of sequences the length of the constant region + barcodes to check for where the best match is located
294 let mut possible_seqs = Vec::new();
295 for index in 0..length_diff {
296 let possible_seq = self
297 .sequence
298 .chars()
299 .skip(index) // skip to where the current index is and take the next amount equal to the length of the constant region + barcodes
300 .take(format_string.len())
301 .collect::<String>();
302 // Add the new sequence to the vector of all possible matches
303 possible_seqs.push(possible_seq);
304 }
305 // Find the closest match within what was sequenced to the constant region
306 let best_sequence_option = fix_error(format_string, &possible_seqs, max_constant_errors);
307
308 if let Some(best_sequence) = best_sequence_option {
309 self.insert_barcodes_constant_region(format_string, best_sequence);
310 } else {
311 self.sequence = "".to_string();
312 }
313 }
314
315 /// Each DNA base read score within FASTQ is the ascii number - 33.
316 /// This returns the number scores associated with the ascii values
317 ///
318 /// Score Error Probability
319 /// 40 0.0001
320 /// 30 0.001
321 /// 20 0.01
322 /// 10 0.1
323 pub fn quality_scores(&self) -> Vec<u8> {
324 self.quality_values
325 .chars()
326 .map(|ch| ch as u8 - 33)
327 .collect::<Vec<u8>>()
328 }
329
330 /// Test for if any of the barcode average quality score falls below the min_average cutoff
331 pub fn low_quality(
332 &self,
333 min_average: f32,
334 barcode_indicator_string: &str,
335 start: usize,
336 ) -> bool {
337 let mut scores = Vec::new(); // vec to hold the quality scores for each barcode
338 let mut previous_type = '\0'; // setup previoius barcode inidator type for the first comparison
339
340 for (score, seq_type) in self
341 .quality_scores()
342 .iter()
343 .skip(start)
344 .zip(barcode_indicator_string.chars())
345 // Zip score and barcode indicator
346 {
347 // Check for change in barcode/sequence type to know to calculate average score and create a new start to scores
348 if seq_type != previous_type {
349 // If scores is empty. This avoids if there is a transition from consant region to barcode. Constant region is not calculated
350 if !scores.is_empty() {
351 // Get the average quality score for the barcode and if it is below the max average return true for low_quality
352 let sum: f32 = scores.iter().sum();
353 let average_score: f32 = sum / scores.len() as f32;
354 if average_score < min_average {
355 return true;
356 }
357 // Start a new vec for the next barcode
358 scores = Vec::new();
359 }
360 // set previous indicator type to the current type to check next
361 previous_type = seq_type;
362 // if indicator is not for a constant region, create a new vec with the new score
363 if seq_type != 'C' {
364 scores = vec![*score as f32];
365 }
366 } else {
367 // If indicator type is not for a constant region, add the score to the vec
368 if seq_type != 'C' {
369 scores.push(*score as f32);
370 }
371 }
372 }
373 // If no average scores cause a true return, then return low_quality as false
374 false
375 }
376
377 pub fn check_fastq_format(&self) -> Result<()> {
378 // Test to see if the first line is not a sequence and the second is a sequence, which is typical fastq format
379 match test_sequence(&self.description) {
380 LineType::Sequence => {
381 println!("{}", self);
382 return Err(anyhow!("The first line within the FASTQ contains DNA sequences. Check the FASTQ format"));
383 }
384 LineType::Metadata => (),
385 }
386 match test_sequence(&self.sequence) {
387 LineType::Sequence => (),
388 LineType::Metadata => {
389 println!("{}", self);
390 return Err(anyhow!("The second line within the FASTQ file is not a sequence. Check the FASTQ format"));
391 }
392 }
393 Ok(())
394 }
395}
396
397impl fmt::Display for RawSequenceRead {
398 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
399 write!(
400 f,
401 "Line 1: {}\nLine 2: {}\nLine 3: {}\nLine 4: {}",
402 self.description, self.sequence, self.add_description, self.quality_values
403 )
404 }
405}
406
407/// An enum of linetype to use with test_sequence. Every line of a FASTQ should either be sequence or metadata for the sequence.
408enum LineType {
409 Sequence,
410 Metadata,
411}
412
413/// Tests whether a line within the file String is a sequence by checking if over half of the line contains DNA neceotide letters
414fn test_sequence(sequence: &str) -> LineType {
415 let sequence_length = sequence.len(); // the the length of the line
416 let adenines = sequence.matches('A').count(); // And the amount of each DNA nucleotide
417 let guanines = sequence.matches('G').count();
418 let cytosines = sequence.matches('C').count();
419 let thymines = sequence.matches('T').count();
420 let any = sequence.matches('N').count();
421 let total_dna = adenines + guanines + cytosines + thymines + any;
422 // Check if less than half of the line contains DNA nucleotides. If so, return that the line is metadata. Otherwise, a sequence
423 if total_dna < sequence_length / 2 {
424 return LineType::Metadata;
425 }
426 LineType::Sequence
427}
428
429/// A struct to hold the results of the regex search on the sequence along with perform the functions to fix and find
430pub struct SequenceMatchResult {
431 pub sample_barcode: String,
432 pub counted_barcodes: Vec<String>,
433 pub counted_barcode_error: bool,
434 pub sample_barcode_error: bool,
435 pub random_barcode: Option<String>,
436}
437
438impl SequenceMatchResult {
439 pub fn new(
440 barcodes: Captures, // The regex result on the sequence
441 barcode_groups: &[String],
442 counted_barcode_seqs: &[AHashSet<String>], // The vec of known counted barcode sequences in order to fix sequencing errors. Will be empty if none are known or included
443 counted_barcode_max_errors: &[u16], // The maximum errors allowed for each counted barcode
444 sample_seqs: &AHashSet<String>, // A hashset of all known sample barcodes. Will be empty if none are known or included
445 sample_seqs_max_errors: u16, // Maximum allowed sample barcode sequencing errors
446 ) -> SequenceMatchResult {
447 // Check for sample barcode and start with setting error to false
448 let mut sample_barcode_error = false;
449 let sample_barcode;
450 // If 'sample' is within the regex returned search continue with checking and fixing
451 if let Some(sample_barcode_match) = barcodes.name("sample") {
452 let sample_barcode_str = sample_barcode_match.as_str();
453 if sample_seqs.is_empty() {
454 sample_barcode = sample_barcode_str.to_string();
455 } else {
456 // If the sample barcode is known save it
457 if sample_seqs.contains(sample_barcode_str) {
458 sample_barcode = sample_barcode_str.to_string();
459 } else {
460 // Otherwise try and fix it. If the fix returns none, then save the error and an empty string
461 let sample_barcode_fix_option =
462 fix_error(sample_barcode_str, sample_seqs, sample_seqs_max_errors);
463 if let Some(fixed_barcode) = sample_barcode_fix_option {
464 sample_barcode = fixed_barcode;
465 } else {
466 sample_barcode = String::new();
467 sample_barcode_error = true;
468 }
469 }
470 }
471 } else {
472 // If there was no sample, save an empty string which should not have any allocation
473 sample_barcode = "barcode".to_string();
474 }
475
476 // Check the counted barcodes and start with setting the error to false
477 let mut counted_barcode_error = false;
478 // Create an empty vec to hold the barcodes
479 let mut counted_barcodes = Vec::new();
480 // Only continue if the sample barcode was found
481 if !sample_barcode_error {
482 // Iterate through the counted barcocdes. Fix if they are not within the known barcodes
483 for (index, barcode_group) in barcode_groups.iter().enumerate() {
484 let mut counted_barcode =
485 barcodes.name(barcode_group).unwrap().as_str().to_string();
486 // If a barcode conversion file was included and there are known barcodes, check for sequencing errors
487 if !counted_barcode_seqs.is_empty() {
488 // If the barcode is not known, try and fix
489 if !counted_barcode_seqs[index].contains(&counted_barcode) {
490 let barcode_seq_fix_option = fix_error(
491 &counted_barcode,
492 &counted_barcode_seqs[index],
493 counted_barcode_max_errors[index],
494 );
495 if let Some(fixed_barcode) = barcode_seq_fix_option {
496 counted_barcode = fixed_barcode;
497 } else {
498 // If a fix was not found, return the error and stop going through more barcodes
499 counted_barcode_error = true;
500 break;
501 }
502 }
503 }
504 // If all is well, add the counted barcode to the vec
505 counted_barcodes.push(counted_barcode);
506 }
507 }
508
509 // Chceck for a random barcode
510 let random_barcode;
511 // If a random barcode exists, add it. Otherwise set it to an empty string
512 if let Some(random_barcode_match) = barcodes.name("random") {
513 random_barcode = Some(random_barcode_match.as_str().to_string())
514 } else {
515 random_barcode = None
516 }
517 SequenceMatchResult {
518 sample_barcode,
519 counted_barcodes,
520 counted_barcode_error,
521 sample_barcode_error,
522 random_barcode,
523 }
524 }
525
526 /// Returns a comma separated counted barcodes string. Perfect for CSV file writing
527 pub fn barcode_string(&self) -> String {
528 self.counted_barcodes.join(",")
529 }
530}
531
532/// Fix an error in a sequence by comparing it to all possible sequences. If no sequence matches with fewer or equal to the number of mismatches 'None' is returned.
533/// 'None' is also returned if two or more sequences are best matches. Will work with vec and hashset
534///
535/// # Example
536///
537/// ```
538/// use barcode_count::parse::fix_error;
539///
540/// let barcode = "AGTAG";
541///
542/// let possible_barcodes_one_match: std::collections::AHashSet<String> = ["AGCAG".to_string(), "ACAAG".to_string(), "AGCAA".to_string()].iter().cloned().collect(); // only the first has a single mismatch
543/// let possible_barcodes_two_match: std::collections::AHashSet<String> = ["AGCAG".to_string(), "AGAAG".to_string(), "AGCAA".to_string()].iter().cloned().collect(); // first and second have a single mismatch
544///
545/// let max_mismatches = barcode.chars().count() as u16 / 5; // allow up to 20% mismatches
546///
547/// let fixed_error_one = fix_error(barcode, &possible_barcodes_one_match, max_mismatches);
548/// let fixed_error_two = fix_error(barcode, &possible_barcodes_two_match, max_mismatches);
549///
550/// assert_eq!(fixed_error_one, Some("AGCAG".to_string()));
551/// assert_eq!(fixed_error_two, None);
552/// ```
553pub fn fix_error<'a, I>(mismatch_seq: &str, possible_seqs: I, mismatches: u16) -> Option<String>
554where
555 I: IntoIterator<Item = &'a String>,
556{
557 let mut best_match = None; // start the best match with None
558 let mut best_mismatch_count = mismatches + 1; // Add 1 and start the best. This allows a match with the same mismatches as required
559 let mut keep = true; // An initiated variable to check if there is more than one best match
560
561 // Iterate through possible matches
562 for true_seq in possible_seqs {
563 // Initiate the number of mismatches for the current iterated possible sequece match
564 let mut mismatches = 0;
565
566 // Iterate through the nucleotides of the possible match and the sequence to be fixed finding how many mismatches
567 // If the mismatches exceed the current best mismatched, end this early
568 for (possible_char, current_char) in true_seq.chars().zip(mismatch_seq.chars()) {
569 if possible_char != current_char && current_char != 'N' && possible_char != 'N' {
570 mismatches += 1;
571 }
572 if mismatches > best_mismatch_count {
573 break;
574 }
575 }
576 // If there are more than one best match, don't keep
577 if mismatches == best_mismatch_count {
578 keep = false
579 }
580 // If this is the best match, keep and reset best mismatches to this value
581 if mismatches < best_mismatch_count {
582 keep = true;
583 best_mismatch_count = mismatches;
584 best_match = Some(true_seq.to_string());
585 }
586 }
587 // If there is one best match and it is some, return it. Otherwise return None
588 if keep && best_match.is_some() {
589 best_match
590 } else {
591 None
592 }
593}