1use bitvec::prelude::*;
2use std::ops::Range;
3
4use crate::color::{color_background, color_head};
5use crate::color::{COLOR_BACKGROUND, COLOR_BASES, COLOR_QUALS};
6use crate::reverse_complement;
7use crate::DNA_BASES;
8use anyhow::{bail, Context, Result};
9use bstr::ByteSlice;
10use regex::bytes::{Regex, RegexBuilder, RegexSet, RegexSetBuilder};
11use seq_io::fastq::{OwnedRecord, Record};
12
/// Options shared by every [`Matcher`] implementation.
#[derive(Clone, Copy, Debug)]
pub struct MatcherOpts {
    /// Invert the sense of matching: bases that do NOT contain the pattern count as a match.
    pub invert_match: bool,
    /// Also search the reverse complement of the read's bases.
    pub reverse_complement: bool,
    /// Color the read (header, bases, and qualities) when evaluating a match.
    pub color: bool,
}
24
25fn to_bitvec(ranges: impl Iterator<Item = Range<usize>>, len: usize) -> BitVec {
27 let mut vec = bitvec![0; len];
28 ranges.for_each(|range| {
29 for index in range {
30 vec.set(index, true);
31 }
32 });
33 vec
34}
35
36fn bases_colored(
39 bases: &[u8],
40 quals: &[u8],
41 ranges: impl Iterator<Item = Range<usize>>,
42) -> (Vec<u8>, Vec<u8>) {
43 let mut colored_bases = Vec::with_capacity(bases.len());
45 let mut colored_quals = Vec::with_capacity(bases.len());
46
47 let bits = to_bitvec(ranges, bases.len());
49
50 let mut last_color_on = false;
53 let mut last_bases_index = 0;
54 let mut cur_bases_index = 0;
55 for base_color_on in bits.iter() {
56 if *base_color_on {
57 if !last_color_on {
59 if last_bases_index + 1 < cur_bases_index {
61 COLOR_BACKGROUND
62 .paint(&bases[last_bases_index..cur_bases_index])
63 .write_to(&mut colored_bases)
64 .unwrap();
65 COLOR_BACKGROUND
66 .paint(&quals[last_bases_index..cur_bases_index])
67 .write_to(&mut colored_quals)
68 .unwrap();
69 }
70 last_bases_index = cur_bases_index;
72 }
73
74 last_color_on = true;
75 } else {
76 if last_color_on {
78 if last_bases_index + 1 < cur_bases_index {
80 COLOR_BASES
81 .paint(&bases[last_bases_index..cur_bases_index])
82 .write_to(&mut colored_bases)
83 .unwrap();
84 COLOR_QUALS
85 .paint(&quals[last_bases_index..cur_bases_index])
86 .write_to(&mut colored_quals)
87 .unwrap();
88 }
89 last_bases_index = cur_bases_index;
91 }
92 last_color_on = false;
93 }
94 cur_bases_index += 1;
95 }
96 if last_bases_index + 1 < cur_bases_index {
98 if last_color_on {
99 COLOR_BASES
100 .paint(&bases[last_bases_index..cur_bases_index])
101 .write_to(&mut colored_bases)
102 .unwrap();
103 COLOR_QUALS
104 .paint(&quals[last_bases_index..cur_bases_index])
105 .write_to(&mut colored_quals)
106 .unwrap();
107 } else {
108 COLOR_BACKGROUND
109 .paint(&bases[last_bases_index..cur_bases_index])
110 .write_to(&mut colored_bases)
111 .unwrap();
112 COLOR_BACKGROUND
113 .paint(&quals[last_bases_index..cur_bases_index])
114 .write_to(&mut colored_quals)
115 .unwrap();
116 }
117 }
118
119 (colored_bases, colored_quals)
120}
121
122pub fn validate_fixed_pattern(pattern: &str) -> Result<()> {
124 for (index, base) in pattern.chars().enumerate() {
125 if !DNA_BASES.contains(&(base as u8)) {
126 bail!(
127 "Fixed pattern must contain only DNA bases: {} .. [{}] .. {}",
128 &pattern[0..index],
129 &pattern[index..=index],
130 &pattern[index + 1..],
131 )
132 }
133 }
134 Ok(())
135}
136
137pub trait Matcher {
139 fn opts(&self) -> MatcherOpts;
141
142 fn bases_match(&self, bases: &[u8]) -> bool;
144
145 fn color_matched_bases(&self, bases: &[u8], quals: &[u8]) -> (Vec<u8>, Vec<u8>);
149
150 fn read_match(&self, read: &mut OwnedRecord) -> bool {
152 let match_found = if self.opts().invert_match {
153 self.bases_match(read.seq())
154 && (!self.opts().reverse_complement
155 || self.bases_match(&reverse_complement(read.seq())))
156 } else {
157 self.bases_match(read.seq())
158 || (self.opts().reverse_complement
159 && self.bases_match(&reverse_complement(read.seq())))
160 };
161
162 if self.opts().color {
163 if match_found {
164 let (seq, qual) = self.color_matched_bases(&read.seq, &read.qual);
165 read.head = color_head(&read.head);
166 read.seq = seq;
167 read.qual = qual;
168 } else {
169 read.head = color_background(&read.head);
171 read.seq = color_background(&read.seq);
172 read.qual = color_background(&read.qual);
173 }
174 }
175
176 match_found
177 }
178}
179
180pub struct FixedStringMatcher {
182 pattern: Vec<u8>,
183 opts: MatcherOpts,
184}
185
186impl Matcher for FixedStringMatcher {
187 fn bases_match(&self, bases: &[u8]) -> bool {
188 bases.find(&self.pattern).is_some() != self.opts.invert_match
189 }
190
191 fn color_matched_bases(&self, bases: &[u8], quals: &[u8]) -> (Vec<u8>, Vec<u8>) {
192 let ranges = bases.find_iter(&self.pattern).map(|start| Range {
193 start,
194 end: start + self.pattern.len(),
195 });
196 if self.opts().reverse_complement {
197 let bases_revcomp = &reverse_complement(bases);
198 let ranges_revcomp = bases_revcomp
199 .find_iter(&self.pattern)
200 .map(|start| bases.len() - start - self.pattern.len())
201 .map(|start| Range {
202 start,
203 end: start + self.pattern.len(),
204 });
205 bases_colored(bases, quals, ranges.chain(ranges_revcomp))
206 } else {
207 bases_colored(bases, quals, ranges)
208 }
209 }
210
211 fn opts(&self) -> MatcherOpts {
212 self.opts
213 }
214}
215
216impl FixedStringMatcher {
217 pub fn new(pattern: &str, opts: MatcherOpts) -> Self {
218 let pattern = pattern.as_bytes().to_vec();
219 Self { pattern, opts }
220 }
221}
222
223pub struct FixedStringSetMatcher {
225 patterns: Vec<Vec<u8>>,
226 opts: MatcherOpts,
227}
228
229impl Matcher for FixedStringSetMatcher {
230 fn bases_match(&self, bases: &[u8]) -> bool {
231 self.patterns
232 .iter()
233 .any(|pattern| bases.find(pattern).is_some())
234 != self.opts.invert_match
235 }
236
237 fn color_matched_bases(&self, bases: &[u8], quals: &[u8]) -> (Vec<u8>, Vec<u8>) {
238 let ranges = self.patterns.iter().flat_map(|pattern| {
239 bases
240 .find_iter(&pattern)
241 .map(|start| Range {
242 start,
243 end: start + pattern.len(),
244 })
245 .collect::<Vec<_>>()
246 });
247 if self.opts().reverse_complement {
248 let bases_revcomp = &reverse_complement(bases);
249 let ranges_revcomp = self.patterns.iter().flat_map(|pattern| {
250 bases_revcomp
251 .find_iter(&pattern)
252 .map(|start| bases.len() - start - pattern.len())
253 .map(|start| Range {
254 start,
255 end: start + pattern.len(),
256 })
257 .collect::<Vec<_>>()
258 });
259 bases_colored(bases, quals, ranges.chain(ranges_revcomp))
260 } else {
261 bases_colored(bases, quals, ranges)
262 }
263 }
264
265 fn opts(&self) -> MatcherOpts {
266 self.opts
267 }
268}
269
270impl FixedStringSetMatcher {
271 pub fn new<I, S>(patterns: I, opts: MatcherOpts) -> Self
272 where
273 S: AsRef<str>,
274 I: IntoIterator<Item = S>,
275 {
276 let patterns: Vec<Vec<u8>> = patterns
277 .into_iter()
278 .map(|pattern| pattern.as_ref().to_owned().as_bytes().to_vec())
279 .collect();
280 Self { patterns, opts }
281 }
282}
283
284pub struct RegexMatcher {
286 regex: Regex,
287 opts: MatcherOpts,
288}
289
290impl RegexMatcher {
291 pub fn new(pattern: &str, opts: MatcherOpts) -> Self {
292 let regex = RegexBuilder::new(pattern)
293 .build()
294 .context(format!("Invalid regular expression: {}", pattern))
295 .unwrap();
296 Self { regex, opts }
297 }
298}
299
300impl Matcher for RegexMatcher {
301 fn bases_match(&self, bases: &[u8]) -> bool {
302 self.regex.is_match(bases) != self.opts.invert_match
303 }
304
305 fn color_matched_bases(&self, bases: &[u8], quals: &[u8]) -> (Vec<u8>, Vec<u8>) {
306 let ranges = self.regex.find_iter(bases).map(|m| m.range());
307 if self.opts().reverse_complement {
308 let bases_revcomp = &reverse_complement(bases);
309 let ranges_revcomp =
310 self.regex
311 .find_iter(bases_revcomp)
312 .map(|m| m.range())
313 .map(|range| Range {
314 start: bases.len() - range.start - range.len(),
315 end: bases.len() - range.start,
316 });
317 bases_colored(bases, quals, ranges.chain(ranges_revcomp))
318 } else {
319 bases_colored(bases, quals, ranges)
320 }
321 }
322
323 fn opts(&self) -> MatcherOpts {
324 self.opts
325 }
326}
327
328pub struct RegexSetMatcher {
329 regex_set: RegexSet,
330 regex_matchers: Vec<RegexMatcher>,
331 opts: MatcherOpts,
332}
333
334impl RegexSetMatcher {
336 pub fn new<I, S>(patterns: I, opts: MatcherOpts) -> Self
337 where
338 S: AsRef<str>,
339 I: IntoIterator<Item = S>,
340 {
341 let string_patterns: Vec<String> = patterns
342 .into_iter()
343 .map(|p| p.as_ref().to_string())
344 .collect();
345 let regex_set = RegexSetBuilder::new(string_patterns.clone())
346 .build()
347 .unwrap();
348 let regex_matchers: Vec<RegexMatcher> = string_patterns
349 .into_iter()
350 .map(|pattern| RegexMatcher::new(pattern.as_ref(), opts))
351 .collect();
352 Self {
353 regex_set,
354 regex_matchers,
355 opts,
356 }
357 }
358}
359
360impl Matcher for RegexSetMatcher {
361 fn bases_match(&self, bases: &[u8]) -> bool {
362 self.regex_set.is_match(bases) != self.opts.invert_match
363 }
364
365 fn color_matched_bases(&self, bases: &[u8], quals: &[u8]) -> (Vec<u8>, Vec<u8>) {
366 let ranges = self
367 .regex_matchers
368 .iter()
369 .flat_map(|r| r.regex.find_iter(bases).map(|m| m.range()));
370 if self.opts().reverse_complement {
371 let bases_revcomp = &reverse_complement(bases);
372 let ranges_revcomp = self.regex_matchers.iter().flat_map(|r| {
373 r.regex
374 .find_iter(bases_revcomp)
375 .map(|m| m.range())
376 .map(|range| Range {
377 start: bases.len() - range.start - range.len(),
378 end: bases.len() - range.start,
379 })
380 });
381 bases_colored(bases, quals, ranges.chain(ranges_revcomp))
382 } else {
383 bases_colored(bases, quals, ranges)
384 }
385 }
386
387 fn opts(&self) -> MatcherOpts {
388 self.opts
389 }
390}
391
392pub struct MatcherFactory;
394
395impl MatcherFactory {
396 pub fn new_matcher(
397 pattern: &Option<String>,
398 fixed_strings: bool,
399 regexp: &Vec<String>,
400 match_opts: MatcherOpts,
401 ) -> Box<dyn Matcher + Sync + Send> {
402 match (fixed_strings, &pattern) {
403 (true, Some(pattern)) => Box::new(FixedStringMatcher::new(pattern, match_opts)),
404 (false, Some(pattern)) => Box::new(RegexMatcher::new(pattern, match_opts)),
405 (true, None) => Box::new(FixedStringSetMatcher::new(regexp, match_opts)),
406 (false, None) => Box::new(RegexSetMatcher::new(regexp, match_opts)),
407 }
408 }
409}
410
411#[cfg(test)]
413pub mod tests {
414 use crate::matcher::*;
415 use rstest::rstest;
416
417 fn write_owned_record(seq: &str) -> OwnedRecord {
420 let read = OwnedRecord {
421 head: ("@Sample").as_bytes().to_vec(),
422 seq: seq.as_bytes().to_vec(),
423 qual: vec![b'X'; seq.len()],
424 };
425 read
426 }
427
428 #[rstest]
433 #[case(vec![(0, 1)], "AGG", bitvec![1, 0, 0])] #[case(vec![(2, 3)], "AGG", bitvec![0, 0, 1])] #[case(vec![(1, 2)], "AGG", bitvec![0, 1, 0])] #[case(vec![(0 ,0)], "AGG", bitvec![0, 0, 0])] #[case(vec![(0, 3)], "AGG", bitvec![1, 1, 1])] #[case(vec![(1, 4)], "AGGTC", bitvec![0, 1, 1, 1, 0])] #[case(vec![(0, 2), (3, 5)], "AGGTC", bitvec![1, 1, 0, 1, 1])] #[case(vec![(0, 3), (3, 5)], "AGGTC", bitvec![1, 1, 1, 1, 1])] #[case(vec![(0, 4), (3, 5)], "AGGTC", bitvec![1, 1, 1, 1, 1])] #[case(vec![(0, 3), (0, 5)], "AGGTC", bitvec![1, 1, 1, 1, 1])] #[case(vec![(4, 5), (0, 2)], "AGGTC", bitvec![1, 1, 0, 0, 1])] fn test_to_bitvec(
445 #[case] ranges: Vec<(usize, usize)>,
446 #[case] bases: &str,
447 #[case] expected: BitVec,
448 ) {
449 let ranges = ranges
450 .into_iter()
451 .map(|(start, end)| std::ops::Range { start, end });
452 let result_bitvec = to_bitvec(ranges, bases.len());
453 assert_eq!(result_bitvec, expected);
454 }
455
456 #[rstest]
461 #[case(false, "AG", "AGG", true)] #[case(false, "CC", "AGG", false)] #[case(true, "CC", "AGG", true)] #[case(true, "TT", "AGG", false)] #[case(false, "AT", "ATGAT", true)] #[case(true, "CG", "GCCG", true)] #[case(false, "AGAG", "AGAGAGAG", true)] #[case(true, "TCTC", "AGAGAGAG", true)] fn test_fixed_string_matcher_read_match(
470 #[case] reverse_complement: bool,
471 #[case] pattern: &str,
472 #[case] seq: &str,
473 #[case] expected: bool,
474 ) {
475 let invert_matches = [true, false];
476 for invert_match in IntoIterator::into_iter(invert_matches).to_owned() {
477 let opts = MatcherOpts {
478 invert_match,
479 reverse_complement,
480 color: false,
481 };
482 let matcher = FixedStringMatcher::new(pattern, opts);
483 let mut read_record = write_owned_record(seq);
484 let result = matcher.read_match(&mut read_record);
485 if invert_match {
486 assert_ne!(result, expected);
487 } else {
488 assert_eq!(result, expected);
489 }
490 }
491 }
492
493 #[rstest]
498 #[case(false, vec!["A", "AGG", "G"], "AGGG", true)] #[case(true, vec!["A", "AGG", "G"], "TCCC", true)] #[case(false, vec!["A", "AGG", "G"], "TTTT", false)] #[case(true, vec!["T", "AAA"], "CCCCC", false)] #[case(false, vec!["AGG", "C", "TT"], "AGGTT", true)] #[case(true, vec!["AGG", "C", "TT"], "GGGGG", true)] #[case(false, vec!["AC", "TT"], "TTACGTT", true)] #[case(true, vec!["GT", "AA"], "TTACGTT", true)] #[case(false, vec!["GAGA","AGTT"], "GAGAGTT", true)] #[case(true, vec!["CTCT","AACT"], "GAGAGTT", true)] fn test_fixed_string_set_metcher_read_match(
509 #[case] reverse_complement: bool,
510 #[case] patterns: Vec<&str>,
511 #[case] seq: &str,
512 #[case] expected: bool,
513 ) {
514 let invert_matches = [true, false];
515 for invert_match in IntoIterator::into_iter(invert_matches).to_owned() {
516 let opts = MatcherOpts {
517 invert_match,
518 reverse_complement,
519 color: false,
520 };
521 let matcher = FixedStringSetMatcher::new(patterns.iter(), opts);
522 let mut read_record = write_owned_record(seq);
523 let result = matcher.read_match(&mut read_record);
524 if invert_match {
525 assert_ne!(result, expected);
526 } else {
527 assert_eq!(result, expected);
528 }
529 }
530 }
531
532 #[rstest]
537 #[case(false, "^A", "AGG", true)] #[case(false, "^T", "AGG", false)] #[case(true, "^C", "AGG", true)] #[case(true, "^T", "AGG", false)] #[case(false, "A.A", "ATATA", true)] #[case(true, "T.G", "CACACA", false)] fn test_regex_matcher_read_match(
544 #[case] reverse_complement: bool,
545 #[case] pattern: &str,
546 #[case] seq: &str,
547 #[case] expected: bool,
548 ) {
549 let invert_matches = [true, false];
550 for invert_match in IntoIterator::into_iter(invert_matches).to_owned() {
551 let opts = MatcherOpts {
552 invert_match,
553 reverse_complement,
554 color: false,
555 };
556
557 let matcher = RegexMatcher::new(&pattern, opts);
558 let mut read_record = write_owned_record(seq);
559 let result = matcher.read_match(&mut read_record);
560 if invert_match {
561 assert_ne!(result, expected);
562 } else {
563 assert_eq!(result, expected);
564 }
565 }
566 }
567
568 #[rstest]
573 #[case(false, vec!["^A.G", "C..", "$T"], "AGGCTT", true)] #[case(true, vec!["^T.C", "..G", "$A"], "AGGCTT", true)] #[case(false, vec!["^A.G", "G..", "$T"], "CCTCA", false)] #[case(true, vec!["$A", "C.CC"], "CCTCA", false)] #[case(false, vec!["^T", ".GG", "A.+G"], "ATCTACTACG", true)] #[case(true, vec!["^C", ".CC", "C+.T"], "ATCTACTACG", true)] #[case(false, vec!["^T", "T.A"], "TTAATAA", true)] #[case(true, vec!["^T", "T.A"], "AATA", true)] #[case(false, vec!["^T","T.+G"], "TAGAGTG", true)] #[case(true, vec!["^A","A.+C"], "TAGAGTG", true)] fn test_regex_set_metcher_read_match(
584 #[case] reverse_complement: bool,
585 #[case] patterns: Vec<&str>,
586 #[case] seq: &str,
587 #[case] expected: bool,
588 ) {
589 let invert_matches = [true, false];
590 for invert_match in IntoIterator::into_iter(invert_matches).to_owned() {
591 let opts = MatcherOpts {
592 invert_match,
593 reverse_complement,
594 color: false,
595 };
596
597 let matcher = RegexSetMatcher::new(patterns.iter(), opts);
598 let mut read_record = write_owned_record(seq);
599 let result = matcher.read_match(&mut read_record);
600 if invert_match {
601 assert_ne!(result, expected);
602 } else {
603 assert_eq!(result, expected);
604 }
605 }
606 }
607
608 #[test]
613 fn test_validate_fixed_pattern_is_ok() {
614 let pattern = "AGTGTGATG";
615 let result = validate_fixed_pattern(&pattern);
616 assert!(result.is_ok())
617 }
618 #[test]
619 fn test_validate_fixed_pattern_error() {
620 let pattern = "AXGTGTGATG";
621 let msg = String::from("Fixed pattern must contain only DNA bases: A .. [X] .. GTGTGATG");
622 let result = validate_fixed_pattern(&pattern);
623 let inner = result.unwrap_err().to_string();
624 assert_eq!(inner, msg);
625 }
626}