Skip to main content

fqgrep_lib/
matcher.rs

1use ahash::AHashSet;
2use aho_corasick::AhoCorasick;
3use bio::data_structures::bitenc::BitEnc;
4use bitvec::prelude::*;
5use seq_io::fastq::RefRecord;
6use std::ops::Range;
7
8use crate::AMINO_ACIDS;
9use crate::DNA_BASES;
10use crate::DNA_MASK_VALUES;
11use crate::color::{COLOR_BACKGROUND, COLOR_BASES, COLOR_QUALS};
12use crate::color::{color_background, color_head};
13use crate::encode;
14use crate::reverse_complement;
15use anyhow::{Context, Result, bail};
16use bstr::ByteSlice;
17use regex::bytes::{Regex, RegexBuilder, RegexSet, RegexSetBuilder};
18use seq_io::fastq::{OwnedRecord, Record};
19
20/// Common options for pattern matchers
21#[derive(Copy, Clone, Debug)]
22pub struct MatcherOpts {
23    /// Invert the matching. The bases are said to match if they do not contain the pattern
24    pub invert_match: bool,
25    /// Include the reverse complement of the bases.  The bases are said to match if they or their
26    /// reverse complement contains the pattern.
27    pub reverse_complement: bool,
28}
29
30/// Builds a bit vector from an iterator of ranges.  The ranges may overlap each other.
31fn to_bitvec(ranges: impl Iterator<Item = Range<usize>>, len: usize) -> BitVec {
32    let mut vec = bitvec![0; len];
33    ranges.for_each(|range| {
34        for index in range {
35            vec.set(index, true);
36        }
37    });
38    vec
39}
40
41/// Color the bases and qualities based on ranges specifying where a pattern matched bases.  The
42/// ranges may overlap each other.
43fn bases_colored(
44    bases: &[u8],
45    quals: &[u8],
46    ranges: impl Iterator<Item = Range<usize>>,
47) -> (Vec<u8>, Vec<u8>) {
48    // The resulting colored bases
49    let mut colored_bases = Vec::with_capacity(bases.len());
50    let mut colored_quals = Vec::with_capacity(bases.len());
51
52    // Merge the ranges into a bit mask, with 1 indicating that base is part of a pattern match
53    let bits = to_bitvec(ranges, bases.len());
54
55    // Iterate over the bit mask, finding stretches of matching and non-matching bases.  Color both
56    // in both the bases and qualities
57    let mut last_color_on = false;
58    let mut last_bases_index = 0;
59    let mut cur_bases_index = 0;
60    for base_color_on in bits.iter() {
61        if *base_color_on {
62            // this base is to be colored
63            if !last_color_on {
64                // add up to but not including this base to the colored vector **as uncolored**
65                if last_bases_index < cur_bases_index {
66                    COLOR_BACKGROUND
67                        .paint(&bases[last_bases_index..cur_bases_index])
68                        .write_to(&mut colored_bases)
69                        .unwrap();
70                    COLOR_BACKGROUND
71                        .paint(&quals[last_bases_index..cur_bases_index])
72                        .write_to(&mut colored_quals)
73                        .unwrap();
74                }
75                // first base in a run of bases to be colored
76                last_bases_index = cur_bases_index;
77            }
78
79            last_color_on = true;
80        } else {
81            // this base is not to be colored
82            if last_color_on {
83                // add up to but not including this base to the colored vector **as colored**
84                if last_bases_index < cur_bases_index {
85                    COLOR_BASES
86                        .paint(&bases[last_bases_index..cur_bases_index])
87                        .write_to(&mut colored_bases)
88                        .unwrap();
89                    COLOR_QUALS
90                        .paint(&quals[last_bases_index..cur_bases_index])
91                        .write_to(&mut colored_quals)
92                        .unwrap();
93                }
94                // first base in a run of bases to be colored
95                last_bases_index = cur_bases_index;
96            }
97            last_color_on = false;
98        }
99        cur_bases_index += 1;
100    }
101    // Color to the end
102    if last_bases_index < cur_bases_index {
103        if last_color_on {
104            COLOR_BASES
105                .paint(&bases[last_bases_index..cur_bases_index])
106                .write_to(&mut colored_bases)
107                .unwrap();
108            COLOR_QUALS
109                .paint(&quals[last_bases_index..cur_bases_index])
110                .write_to(&mut colored_quals)
111                .unwrap();
112        } else {
113            COLOR_BACKGROUND
114                .paint(&bases[last_bases_index..cur_bases_index])
115                .write_to(&mut colored_bases)
116                .unwrap();
117            COLOR_BACKGROUND
118                .paint(&quals[last_bases_index..cur_bases_index])
119                .write_to(&mut colored_quals)
120                .unwrap();
121        }
122    }
123
124    (colored_bases, colored_quals)
125}
126
127/// Validates that a given FIXED pattern contains only valid bases (DNA or amino acid)
128pub fn validate_fixed_pattern(pattern: &str, protein: bool) -> Result<()> {
129    let valid_chars = if protein {
130        &AMINO_ACIDS[..]
131    } else {
132        &DNA_BASES[..]
133    };
134    let kind = if protein { "amino acids" } else { "DNA bases" };
135    for (index, base) in pattern.char_indices() {
136        if !base.is_ascii() || !valid_chars.contains(&(base as u8)) {
137            let next_index = index + base.len_utf8();
138            bail!(
139                "Fixed pattern must contain only {kind}: {} .. [{}] .. {}",
140                &pattern[0..index],
141                &pattern[index..next_index],
142                &pattern[next_index..],
143            )
144        }
145    }
146    Ok(())
147}
148
149/// Base trait for all pattern matchers
150pub trait Matcher {
151    /// The options for the pattern matcher
152    fn opts(&self) -> MatcherOpts;
153
154    /// Returns true if the bases match the pattern, false otherwise
155    fn bases_match(&self, bases: &[u8]) -> bool;
156
157    /// Colors the bases and qualities based on where they match the pattern.  All bases
158    /// that match the pattern are colored.  Colored in this case means adding ANSI color
159    /// codes for printing to a terminal.
160    fn color_matched_bases(&self, bases: &[u8], quals: &[u8]) -> (Vec<u8>, Vec<u8>);
161
162    /// Returns true if the read's bases match the pattern, false otherwise
163    #[inline]
164    fn read_match(&self, read: &RefRecord) -> bool {
165        let bases_match = self.bases_match(read.seq());
166        if self.opts().invert_match {
167            bases_match
168                && (!self.opts().reverse_complement
169                    || self.bases_match(&reverse_complement(read.seq())))
170        } else {
171            bases_match
172                || (self.opts().reverse_complement
173                    && self.bases_match(&reverse_complement(read.seq())))
174        }
175    }
176
177    /// Adds ANSI color codes to the read's header, sequence, and quality based on where they
178    /// match the pattern(s).
179    #[inline]
180    fn color(&self, read: &mut OwnedRecord, match_found: bool) {
181        if match_found {
182            let (seq, qual) = self.color_matched_bases(&read.seq, &read.qual);
183            read.head = color_head(&read.head);
184            read.seq = seq;
185            read.qual = qual;
186        } else {
187            // always color, in case the read is paired
188            read.head = color_background(&read.head);
189            read.seq = color_background(&read.seq);
190            read.qual = color_background(&read.qual);
191        }
192    }
193}
194
195/// Matcher for a fixed string pattern
196pub struct FixedStringMatcher {
197    pattern: Vec<u8>,
198    opts: MatcherOpts,
199}
200
201impl Matcher for FixedStringMatcher {
202    #[inline]
203    fn bases_match(&self, bases: &[u8]) -> bool {
204        bases.find(&self.pattern).is_some() != self.opts.invert_match
205    }
206
207    fn color_matched_bases(&self, bases: &[u8], quals: &[u8]) -> (Vec<u8>, Vec<u8>) {
208        let ranges = bases.find_iter(&self.pattern).map(|start| Range {
209            start,
210            end: start + self.pattern.len(),
211        });
212        if self.opts().reverse_complement {
213            let bases_revcomp = &reverse_complement(bases);
214            let ranges_revcomp = bases_revcomp
215                .find_iter(&self.pattern)
216                .map(|start| bases.len() - start - self.pattern.len())
217                .map(|start| Range {
218                    start,
219                    end: start + self.pattern.len(),
220                });
221            bases_colored(bases, quals, ranges.chain(ranges_revcomp))
222        } else {
223            bases_colored(bases, quals, ranges)
224        }
225    }
226
227    #[inline]
228    fn opts(&self) -> MatcherOpts {
229        self.opts
230    }
231}
232
233impl FixedStringMatcher {
234    pub fn new(pattern: &str, opts: MatcherOpts) -> Self {
235        let pattern = pattern.as_bytes().to_vec();
236        Self { pattern, opts }
237    }
238}
239
240/// Matcher for a set of fixed string patterns
241pub struct FixedStringSetMatcher {
242    patterns: Vec<Vec<u8>>,
243    aho_corasick: AhoCorasick,
244    opts: MatcherOpts,
245}
246
247impl Matcher for FixedStringSetMatcher {
248    #[inline]
249    fn bases_match(&self, bases: &[u8]) -> bool {
250        self.aho_corasick.is_match(bases) != self.opts.invert_match
251    }
252
253    fn color_matched_bases(&self, bases: &[u8], quals: &[u8]) -> (Vec<u8>, Vec<u8>) {
254        let ranges = self.patterns.iter().flat_map(|pattern| {
255            bases
256                .find_iter(&pattern)
257                .map(|start| Range {
258                    start,
259                    end: start + pattern.len(),
260                })
261                .collect::<Vec<_>>()
262        });
263        if self.opts().reverse_complement {
264            let bases_revcomp = &reverse_complement(bases);
265            let ranges_revcomp = self.patterns.iter().flat_map(|pattern| {
266                bases_revcomp
267                    .find_iter(&pattern)
268                    .map(|start| bases.len() - start - pattern.len())
269                    .map(|start| Range {
270                        start,
271                        end: start + pattern.len(),
272                    })
273                    .collect::<Vec<_>>()
274            });
275            bases_colored(bases, quals, ranges.chain(ranges_revcomp))
276        } else {
277            bases_colored(bases, quals, ranges)
278        }
279    }
280
281    #[inline]
282    fn opts(&self) -> MatcherOpts {
283        self.opts
284    }
285}
286
287impl FixedStringSetMatcher {
288    pub fn new<I, S>(patterns: I, opts: MatcherOpts) -> Result<Self>
289    where
290        S: AsRef<str>,
291        I: IntoIterator<Item = S>,
292    {
293        let patterns: Vec<Vec<u8>> = patterns
294            .into_iter()
295            .map(|pattern| pattern.as_ref().to_owned().as_bytes().to_vec())
296            .collect();
297        let aho_corasick =
298            AhoCorasick::new(&patterns).context("Failed to build Aho-Corasick automaton")?;
299        Ok(Self {
300            patterns,
301            aho_corasick,
302            opts,
303        })
304    }
305}
306
307/// Iterates over matching offsets where `needle` matches within `bases` using bitwise AND.
308/// When `has_iupac` is false, uses a rolling additive hash to skip non-matching offsets.
309/// Calls `on_match` for each matching offset; if it returns `false`, iteration stops early.
310fn bitenc_match_offsets(
311    bases: &BitEnc,
312    needle: &BitEnc,
313    has_iupac: bool,
314    mut on_match: impl FnMut(usize) -> bool,
315) {
316    if bases.nr_symbols() < needle.nr_symbols() {
317        return;
318    }
319
320    let mut bases_hash: usize = 0;
321    let needle_hash = if has_iupac {
322        0
323    } else {
324        (0..needle.nr_symbols())
325            .map(|i| needle.get(i).unwrap() as usize)
326            .sum()
327    };
328
329    'outer: for offset in 0..=bases.nr_symbols() - needle.nr_symbols() {
330        if !has_iupac {
331            if offset == 0 {
332                bases_hash = (0..needle.nr_symbols())
333                    .map(|i| bases.get(i).unwrap() as usize)
334                    .sum();
335            } else {
336                bases_hash += bases.get(offset + needle.nr_symbols() - 1).unwrap() as usize;
337                bases_hash -= bases.get(offset - 1).unwrap() as usize;
338            }
339            if bases_hash != needle_hash {
340                continue 'outer;
341            }
342        }
343        for i in 0..needle.nr_symbols() {
344            let base = bases.get(offset + i).unwrap();
345            if needle.get(i).unwrap() & base != base {
346                continue 'outer;
347            }
348        }
349        if !on_match(offset) {
350            return;
351        }
352    }
353}
354
355/// Finds all positions where `needle` matches within `bases` using bitwise AND matching.
356/// When `has_iupac` is false, uses a rolling additive hash to skip non-matching offsets.
357fn bitenc_find_all(bases: &BitEnc, needle: &BitEnc, has_iupac: bool) -> Vec<Range<usize>> {
358    let mut ranges: Vec<Range<usize>> = Vec::new();
359    let needle_len = needle.nr_symbols();
360    bitenc_match_offsets(bases, needle, has_iupac, |offset| {
361        ranges.push(offset..offset + needle_len);
362        true
363    });
364    ranges
365}
366
367/// Returns true if `needle` matches anywhere within `bases` using bitwise AND matching.
368/// When `has_iupac` is false, uses a rolling additive hash to skip non-matching offsets.
369fn bitenc_find(bases: &BitEnc, needle: &BitEnc, has_iupac: bool) -> bool {
370    let mut found = false;
371    bitenc_match_offsets(bases, needle, has_iupac, |_| {
372        found = true;
373        false // stop after first match
374    });
375    found
376}
377
378/// Returns true if the encoded pattern contains any IUPAC ambiguity codes
379/// (i.e., any symbol that is not a single DNA base mask value).
380fn bitenc_has_iupac(bitenc: &BitEnc) -> bool {
381    !bitenc.iter().all(|value| DNA_MASK_VALUES.contains(&value))
382}
383
384/// Matcher for a single IUPAC bitmask pattern
385pub struct BitMaskMatcher {
386    bitenc: BitEnc,
387    has_iupac: bool,
388    opts: MatcherOpts,
389}
390
391impl Matcher for BitMaskMatcher {
392    #[inline]
393    fn bases_match(&self, bases: &[u8]) -> bool {
394        let bases = encode(bases);
395        bitenc_find(&bases, &self.bitenc, self.has_iupac) != self.opts.invert_match
396    }
397
398    fn color_matched_bases(&self, bases: &[u8], quals: &[u8]) -> (Vec<u8>, Vec<u8>) {
399        let encoded_bases = encode(bases);
400        let ranges = bitenc_find_all(&encoded_bases, &self.bitenc, self.has_iupac);
401        if self.opts().reverse_complement {
402            let bases_revcomp = &reverse_complement(bases);
403            let encoded_revcomp = encode(bases_revcomp);
404            let ranges_revcomp = bitenc_find_all(&encoded_revcomp, &self.bitenc, self.has_iupac)
405                .into_iter()
406                .map(|range| Range {
407                    start: bases.len() - range.start - range.len(),
408                    end: bases.len() - range.start,
409                });
410            bases_colored(bases, quals, ranges.into_iter().chain(ranges_revcomp))
411        } else {
412            bases_colored(bases, quals, ranges.into_iter())
413        }
414    }
415
416    #[inline]
417    fn opts(&self) -> MatcherOpts {
418        self.opts
419    }
420}
421
422impl BitMaskMatcher {
423    pub fn new(bitenc: BitEnc, opts: MatcherOpts) -> Self {
424        let has_iupac = bitenc_has_iupac(&bitenc);
425        Self {
426            bitenc,
427            has_iupac,
428            opts,
429        }
430    }
431}
432
433/// Matcher for a set of IUPAC bitmask patterns
434pub struct BitMaskSetMatcher {
435    bitencs: Vec<BitEnc>,
436    has_iupac: Vec<bool>,
437    opts: MatcherOpts,
438}
439
440impl Matcher for BitMaskSetMatcher {
441    fn bases_match(&self, bases: &[u8]) -> bool {
442        let bases = encode(bases);
443        self.bitencs
444            .iter()
445            .enumerate()
446            .any(|(index, needle)| bitenc_find(&bases, needle, self.has_iupac[index]))
447            != self.opts.invert_match
448    }
449
450    fn color_matched_bases(&self, bases: &[u8], quals: &[u8]) -> (Vec<u8>, Vec<u8>) {
451        let encoded_bases = encode(bases);
452        let ranges = self.bitencs.iter().enumerate().flat_map(|(index, needle)| {
453            bitenc_find_all(&encoded_bases, needle, self.has_iupac[index])
454        });
455        if self.opts().reverse_complement {
456            let bases_revcomp = &reverse_complement(bases);
457            let encoded_revcomp = encode(bases_revcomp);
458            let ranges_revcomp = self.bitencs.iter().enumerate().flat_map(|(index, needle)| {
459                bitenc_find_all(&encoded_revcomp, needle, self.has_iupac[index])
460                    .into_iter()
461                    .map(|range| Range {
462                        start: bases.len() - range.start - range.len(),
463                        end: bases.len() - range.start,
464                    })
465            });
466            bases_colored(bases, quals, ranges.chain(ranges_revcomp))
467        } else {
468            bases_colored(bases, quals, ranges)
469        }
470    }
471
472    #[inline]
473    fn opts(&self) -> MatcherOpts {
474        self.opts
475    }
476}
477
478impl BitMaskSetMatcher {
479    pub fn new(bitencs: Vec<BitEnc>, opts: MatcherOpts) -> Self {
480        let has_iupac = bitencs.iter().map(bitenc_has_iupac).collect();
481        Self {
482            bitencs,
483            has_iupac,
484            opts,
485        }
486    }
487}
488
489/// Matcher for a regular expression pattern
490pub struct RegexMatcher {
491    regex: Regex,
492    opts: MatcherOpts,
493}
494
495impl RegexMatcher {
496    pub fn new(pattern: &str, opts: MatcherOpts) -> Result<Self> {
497        let regex = RegexBuilder::new(pattern)
498            .build()
499            .with_context(|| format!("Invalid regular expression: '{}'", pattern))?;
500        Ok(Self { regex, opts })
501    }
502}
503
504impl Matcher for RegexMatcher {
505    #[inline]
506    fn bases_match(&self, bases: &[u8]) -> bool {
507        self.regex.is_match(bases) != self.opts.invert_match
508    }
509
510    fn color_matched_bases(&self, bases: &[u8], quals: &[u8]) -> (Vec<u8>, Vec<u8>) {
511        let ranges = self.regex.find_iter(bases).map(|m| m.range());
512        if self.opts().reverse_complement {
513            let bases_revcomp = &reverse_complement(bases);
514            let ranges_revcomp =
515                self.regex
516                    .find_iter(bases_revcomp)
517                    .map(|m| m.range())
518                    .map(|range| Range {
519                        start: bases.len() - range.start - range.len(),
520                        end: bases.len() - range.start,
521                    });
522            bases_colored(bases, quals, ranges.chain(ranges_revcomp))
523        } else {
524            bases_colored(bases, quals, ranges)
525        }
526    }
527
528    #[inline]
529    fn opts(&self) -> MatcherOpts {
530        self.opts
531    }
532}
533
534pub struct RegexSetMatcher {
535    regex_set: RegexSet,
536    regex_matchers: Vec<RegexMatcher>,
537    opts: MatcherOpts,
538}
539
540/// Matcher for a set of regular expression patterns
541impl RegexSetMatcher {
542    pub fn new<I, S>(patterns: I, opts: MatcherOpts) -> Result<Self>
543    where
544        S: AsRef<str>,
545        I: IntoIterator<Item = S>,
546    {
547        let string_patterns: Vec<String> = patterns
548            .into_iter()
549            .map(|p| p.as_ref().to_string())
550            .collect();
551        let regex_set = RegexSetBuilder::new(string_patterns.clone())
552            .dfa_size_limit(usize::MAX)
553            .build()
554            .context("Failed to build regex set from patterns")?;
555        let regex_matchers: Vec<RegexMatcher> = string_patterns
556            .into_iter()
557            .map(|pattern| RegexMatcher::new(pattern.as_ref(), opts))
558            .collect::<Result<Vec<_>>>()?;
559        Ok(Self {
560            regex_set,
561            regex_matchers,
562            opts,
563        })
564    }
565}
566
567impl Matcher for RegexSetMatcher {
568    #[inline]
569    fn bases_match(&self, bases: &[u8]) -> bool {
570        self.regex_set.is_match(bases) != self.opts.invert_match
571    }
572
573    fn color_matched_bases(&self, bases: &[u8], quals: &[u8]) -> (Vec<u8>, Vec<u8>) {
574        let ranges = self
575            .regex_matchers
576            .iter()
577            .flat_map(|r| r.regex.find_iter(bases).map(|m| m.range()));
578        if self.opts().reverse_complement {
579            let bases_revcomp = &reverse_complement(bases);
580            let ranges_revcomp = self.regex_matchers.iter().flat_map(|r| {
581                r.regex
582                    .find_iter(bases_revcomp)
583                    .map(|m| m.range())
584                    .map(|range| Range {
585                        start: bases.len() - range.start - range.len(),
586                        end: bases.len() - range.start,
587                    })
588            });
589            bases_colored(bases, quals, ranges.chain(ranges_revcomp))
590        } else {
591            bases_colored(bases, quals, ranges)
592        }
593    }
594
595    #[inline]
596    fn opts(&self) -> MatcherOpts {
597        self.opts
598    }
599}
600
601/// Matcher for query names (read IDs).  Matches reads whose ID (the portion of the header before
602/// the first whitespace) exactly matches one of the query names in the provided set.
603pub struct QueryNameMatcher {
604    query_names: AHashSet<Vec<u8>>,
605    opts: MatcherOpts,
606}
607
608impl QueryNameMatcher {
609    pub fn new(query_names: AHashSet<Vec<u8>>, opts: MatcherOpts) -> Self {
610        Self { query_names, opts }
611    }
612}
613
614impl Matcher for QueryNameMatcher {
615    /// Not used — matching is by query name, not sequence. See `read_match()`.
616    #[inline]
617    fn bases_match(&self, _bases: &[u8]) -> bool {
618        !self.opts.invert_match
619    }
620
621    fn color_matched_bases(&self, bases: &[u8], quals: &[u8]) -> (Vec<u8>, Vec<u8>) {
622        (bases.to_vec(), quals.to_vec())
623    }
624
625    #[inline]
626    fn opts(&self) -> MatcherOpts {
627        self.opts
628    }
629
630    #[inline]
631    fn read_match(&self, read: &RefRecord) -> bool {
632        let id_match = self.query_names.contains(read.id_bytes());
633        if self.opts.invert_match {
634            !id_match
635        } else {
636            id_match
637        }
638    }
639}
640
641/// Factory for building a matcher
642pub struct MatcherFactory;
643
644impl MatcherFactory {
645    pub fn new_matcher(
646        pattern: &Option<String>,
647        fixed_strings: bool,
648        regexp: &Vec<String>,
649        bitencs: Option<Vec<BitEnc>>,
650        match_opts: MatcherOpts,
651    ) -> Result<Box<dyn Matcher + Sync + Send>> {
652        let use_bitmask = bitencs.is_some();
653        let is_single = pattern.is_some();
654        match (use_bitmask, is_single, fixed_strings) {
655            (true, true, _) => {
656                let bitencs = bitencs.unwrap();
657                Ok(Box::new(BitMaskMatcher::new(
658                    bitencs.into_iter().next().unwrap(),
659                    match_opts,
660                )))
661            }
662            (true, false, _) => Ok(Box::new(BitMaskSetMatcher::new(
663                bitencs.unwrap(),
664                match_opts,
665            ))),
666            (false, true, true) => Ok(Box::new(FixedStringMatcher::new(
667                pattern.as_ref().unwrap(),
668                match_opts,
669            ))),
670            (false, true, false) => Ok(Box::new(RegexMatcher::new(
671                pattern.as_ref().unwrap(),
672                match_opts,
673            )?)),
674            (false, false, true) => Ok(Box::new(FixedStringSetMatcher::new(regexp, match_opts)?)),
675            (false, false, false) => Ok(Box::new(RegexSetMatcher::new(regexp, match_opts)?)),
676        }
677    }
678
679    /// Create a matcher for query names (read IDs)
680    pub fn new_query_name_matcher(
681        query_names: AHashSet<Vec<u8>>,
682        match_opts: MatcherOpts,
683    ) -> Box<dyn Matcher + Sync + Send> {
684        Box::new(QueryNameMatcher::new(query_names, match_opts))
685    }
686}
687
688// Tests
689#[cfg(test)]
690pub mod tests {
691    use crate::encode;
692    use crate::matcher::*;
693    use ahash::AHashSet;
694    use rstest::rstest;
695
696    // ############################################################################################
697    // Tests to_bitvec()
698    // ############################################################################################
699
700    #[rstest]
701    #[case(vec![(0, 1)], "AGG",  bitvec![1, 0, 0])] // single range that starts at the beginning of the read
702    #[case(vec![(2, 3)], "AGG", bitvec![0, 0, 1])] // single range that ends at the ends of the read
703    #[case(vec![(1, 2)], "AGG", bitvec![0, 1, 0])] // single range of a single base
704    #[case(vec![(0 ,0)], "AGG", bitvec![0, 0, 0])] // empty set of ranges
705    #[case(vec![(0, 3)], "AGG", bitvec![1, 1, 1])] // single range that encompasses the full read
706    #[case(vec![(1, 4)], "AGGTC", bitvec![0, 1, 1, 1, 0])] // single range in the "middle" of the read
707    #[case(vec![(0, 2), (3, 5)], "AGGTC", bitvec![1, 1, 0, 1, 1])] // two ranges non-overlapping
708    #[case(vec![(0, 3), (3, 5)], "AGGTC", bitvec![1, 1, 1, 1, 1])] // two ranges abutting
709    #[case(vec![(0, 4), (3, 5)], "AGGTC", bitvec![1, 1, 1, 1, 1])] // two ranges overlapping
710    #[case(vec![(0, 3), (0, 5)], "AGGTC", bitvec![1, 1, 1, 1, 1])] // two ranges, one containing the other
711    #[case(vec![(4, 5), (0, 2)], "AGGTC", bitvec![1, 1, 0, 0, 1])] // multiple ranges, but not in sorted order
712    fn test_to_bitvec(
713        #[case] ranges: Vec<(usize, usize)>,
714        #[case] bases: &str,
715        #[case] expected: BitVec,
716    ) {
717        let ranges = ranges
718            .into_iter()
719            .map(|(start, end)| std::ops::Range { start, end });
720        let result_bitvec = to_bitvec(ranges, bases.len());
721        assert_eq!(result_bitvec, expected);
722    }
723
724    // ############################################################################################
725    // Tests bases_colored() single-base boundary
726    // ############################################################################################
727
728    /// Regression test: a single-base match at the start, middle, and end of a sequence must be
729    /// colored.  Before the off-by-one fix (`last_bases_index + 1 < cur_bases_index` →
730    /// `last_bases_index < cur_bases_index`), single-base matches produced empty output.
731    #[test]
732    fn test_single_base_match_colored() {
733        let bases = b"ACGT";
734        let quals = b"IIII";
735
736        // Single-base match at position 0 ("A" in "ACGT")
737        let ranges_start = vec![Range { start: 0, end: 1 }];
738        let (colored_bases, _) = bases_colored(bases, quals, ranges_start.into_iter());
739        assert!(
740            !colored_bases.is_empty(),
741            "single-base match at start must produce colored output"
742        );
743
744        // Single-base match in the middle (position 2, "G" in "ACGT")
745        let ranges_mid = vec![Range { start: 2, end: 3 }];
746        let (colored_bases, _) = bases_colored(bases, quals, ranges_mid.into_iter());
747        assert!(
748            !colored_bases.is_empty(),
749            "single-base match in middle must produce colored output"
750        );
751
752        // Single-base match at the end (position 3, "T" in "ACGT")
753        let ranges_end = vec![Range { start: 3, end: 4 }];
754        let (colored_bases, _) = bases_colored(bases, quals, ranges_end.into_iter());
755        assert!(
756            !colored_bases.is_empty(),
757            "single-base match at end must produce colored output"
758        );
759
760        // Verify the colored output contains the actual base bytes (not just ANSI escapes)
761        // A single-base match ("G" at position 2 in "ACGT") should produce output containing
762        // all four bases with coloring applied
763        let ranges = vec![Range { start: 2, end: 3 }];
764        let (colored_bases, colored_quals) = bases_colored(bases, quals, ranges.into_iter());
765        let colored_str = String::from_utf8_lossy(&colored_bases);
766        let colored_qual_str = String::from_utf8_lossy(&colored_quals);
767        assert!(
768            colored_str.contains("AC"),
769            "uncolored prefix must be present"
770        );
771        assert!(colored_str.contains("G"), "colored base must be present");
772        assert!(
773            colored_str.contains("T"),
774            "uncolored suffix must be present"
775        );
776        assert!(
777            colored_qual_str.contains("II"),
778            "uncolored qual prefix must be present"
779        );
780    }
781
782    // ############################################################################################
783    // Test FixedStringMatcher::read_match()
784    // ############################################################################################
785
786    #[rstest]
787    #[case(false, "AG", "AGG", true)] // fixed string with match when reverse_complement is false
788    #[case(false, "CC", "AGG", false)] // fixed string with no match when reverse_complement is false
789    #[case(true, "CC", "AGG", true)] // fixed string with match when reverse_complement is true
790    #[case(true, "TT", "AGG", false)] // fixed string with no match when reverse_complement is true
791    #[case(false, "AT", "ATGAT", true)] // fixed string with multiple non-overlapping matches when reverse_complement is false
792    #[case(true, "CG", "GCCG", true)] // fixed string with multiple non-overlapping matches when reverse_complement is true
793    #[case(false, "AGAG", "AGAGAGAG", true)] // fixed string with overlapping matches when reverse_complemet is false
794    #[case(true, "TCTC", "AGAGAGAG", true)] // fixed string with overlapping matches when reverse_complemet is true
795    fn test_fixed_string_matcher_read_match(
796        #[case] reverse_complement: bool,
797        #[case] pattern: &str,
798        #[case] seq: &str,
799        #[case] expected: bool,
800    ) {
801        let invert_matches = [true, false];
802        for invert_match in IntoIterator::into_iter(invert_matches) {
803            let opts = MatcherOpts {
804                invert_match,
805                reverse_complement,
806            };
807            let matcher = FixedStringMatcher::new(pattern, opts);
808            let qual = (0..seq.len()).map(|_| "X").collect::<String>();
809            let record = format!("@id\n{seq}\n+\n{qual}\n");
810            let mut reader = seq_io::fastq::Reader::new(record.as_bytes());
811            let read_record = reader.next().unwrap().unwrap();
812            let result = matcher.read_match(&read_record);
813            if invert_match {
814                assert_ne!(result, expected);
815            } else {
816                assert_eq!(result, expected);
817            }
818        }
819    }
820
821    // ############################################################################################
822    // Tests FixedStringSetMatcher::read_match()
823    // ############################################################################################
824
825    #[rstest]
826    #[case(false, vec!["A", "AGG", "G"], "AGGG", true)] // match is true when reverse_complement is false
827    #[case(true, vec!["A", "AGG", "G"], "TCCC", true)] // match is true when reverse_complement is true
828    #[case(false, vec!["A", "AGG", "G"], "TTTT", false)] // match is false when reverse_complement is false
829    #[case(true, vec!["T", "AAA"], "CCCCC", false)] // match is false when reverse_complement is true
830    #[case(false, vec!["AGG", "C", "TT"], "AGGTT",  true)] // match is true but not all patterns match when reverse_complement is false
831    #[case(true, vec!["AGG", "C", "TT"], "GGGGG",  true)] // match is true but not all patterns match when reverse_complement is true
832    #[case(false, vec!["AC", "TT"], "TTACGTT",  true)] // match is true but one pattern matches multiple times when reverse_complement is false
833    #[case(true, vec!["GT", "AA"], "TTACGTT",  true)] // match is true but one pattern matches multiple times when reverse_complement is true
834    #[case(false, vec!["GAGA","AGTT"], "GAGAGTT",  true)] // match is true with overlapping matches when reverse_complment is false
835    #[case(true, vec!["CTCT","AACT"], "GAGAGTT",  true)] // match is true with overlapping matches when reverse_complment is true
836    fn test_fixed_string_set_metcher_read_match(
837        #[case] reverse_complement: bool,
838        #[case] patterns: Vec<&str>,
839        #[case] seq: &str,
840        #[case] expected: bool,
841    ) {
842        let invert_matches = [true, false];
843        for invert_match in IntoIterator::into_iter(invert_matches) {
844            let opts = MatcherOpts {
845                invert_match,
846                reverse_complement,
847            };
848            let matcher = FixedStringSetMatcher::new(patterns.iter(), opts).unwrap();
849            let qual = (0..seq.len()).map(|_| "X").collect::<String>();
850            let record = format!("@id\n{seq}\n+\n{qual}\n");
851            let mut reader = seq_io::fastq::Reader::new(record.as_bytes());
852            let read_record = reader.next().unwrap().unwrap();
853            let result = matcher.read_match(&read_record);
854            if invert_match {
855                assert_ne!(result, expected);
856            } else {
857                assert_eq!(result, expected);
858            }
859        }
860    }
861
862    // ############################################################################################
863    // Test RegexMatcher::read_match()
864    // ############################################################################################
865
866    #[rstest]
867    #[case(false, "^A", "AGG", true)] // regex with one match when reverse_complement is false
868    #[case(false, "^T", "AGG", false)] // regex with no matches when reverse_complement is false
869    #[case(true, "^C", "AGG", true)] // regex with one match when reverse_complement is true
870    #[case(true, "^T", "AGG", false)] // regex with no matches when reverse_complement is true
871    #[case(false, "A.A", "ATATA", true)] // regex with overlapping matches when reverse_complement is false
872    #[case(true, "T.G", "CACACA", false)] // regex with overlapping matches when reverse_complement is true
873    fn test_regex_matcher_read_match(
874        #[case] reverse_complement: bool,
875        #[case] pattern: &str,
876        #[case] seq: &str,
877        #[case] expected: bool,
878    ) {
879        let invert_matches = [true, false];
880        for invert_match in IntoIterator::into_iter(invert_matches) {
881            let opts = MatcherOpts {
882                invert_match,
883                reverse_complement,
884            };
885
886            let matcher = RegexMatcher::new(pattern, opts).unwrap();
887            let qual = (0..seq.len()).map(|_| "X").collect::<String>();
888            let record = format!("@id\n{seq}\n+\n{qual}\n");
889            let mut reader = seq_io::fastq::Reader::new(record.as_bytes());
890            let read_record = reader.next().unwrap().unwrap();
891            let result = matcher.read_match(&read_record);
892            if invert_match {
893                assert_ne!(result, expected);
894            } else {
895                assert_eq!(result, expected);
896            }
897        }
898    }
899
900    // ############################################################################################
901    // Tests RegexSetMatcher::read_match()
902    // ############################################################################################
903
904    #[rstest]
905    #[case(false, vec!["^A.G", "C..", "$T"], "AGGCTT", true)] // match is true when reverse_complement is false
906    #[case(true, vec!["^T.C", "..G", "$A"], "AGGCTT", true)] // match is true when reverse_complement is true
907    #[case(false, vec!["^A.G", "G..", "$T"], "CCTCA", false)] // match is false when reverse_complemet is false
908    #[case(true, vec!["$A", "C.CC"], "CCTCA", false)] // match is false when reverse_complemet is true
909    #[case(false, vec!["^T", ".GG", "A.+G"],  "ATCTACTACG",  true)] // match is true but not all patterns match when reverse_complement is false
910    #[case(true, vec!["^C", ".CC", "C+.T"],  "ATCTACTACG",  true)] // match is true when reverse_complement is true
911    #[case(false, vec!["^T", "T.A"], "TTAATAA", true)] // match is true but one pattern matches multiple times when reverse_complement is false
912    #[case(true, vec!["^T", "T.A"], "AATA", true)] // match is true but one pattern matches multiple times when reverse_complemetn is true
913    #[case(false, vec!["^T","T.+G"], "TAGAGTG",  true)] // match is true with overlapping matches when reverse_complement is false
914    #[case(true, vec!["^A","A.+C"], "TAGAGTG",  true)] // match is true with overlapping matches when reverse_complement is true
915    fn test_regex_set_metcher_read_match(
916        #[case] reverse_complement: bool,
917        #[case] patterns: Vec<&str>,
918        #[case] seq: &str,
919        #[case] expected: bool,
920    ) {
921        let invert_matches = [true, false];
922        for invert_match in IntoIterator::into_iter(invert_matches) {
923            let opts = MatcherOpts {
924                invert_match,
925                reverse_complement,
926            };
927
928            let matcher = RegexSetMatcher::new(patterns.iter(), opts).unwrap();
929            let qual = (0..seq.len()).map(|_| "X").collect::<String>();
930            let record = format!("@id\n{seq}\n+\n{qual}\n");
931            let mut reader = seq_io::fastq::Reader::new(record.as_bytes());
932            let read_record = reader.next().unwrap().unwrap();
933            let result = matcher.read_match(&read_record);
934            if invert_match {
935                assert_ne!(result, expected);
936            } else {
937                assert_eq!(result, expected);
938            }
939        }
940    }
941
942    // ############################################################################################
943    // Tests QueryNameMatcher::read_match()
944    // ############################################################################################
945
946    #[rstest]
947    #[case(false, vec!["read1", "read2"], "read1", true)] // exact match found
948    #[case(false, vec!["read1", "read2"], "read3", false)] // no match
949    #[case(false, vec!["read1"], "read1_extra", false)] // partial match should fail (query name is a prefix of read ID)
950    #[case(false, vec!["read1_extra"], "read1", false)] // partial match should fail (read ID is a prefix of query name)
951    #[case(true, vec!["read1", "read2"], "read1", false)] // invert_match: found becomes false
952    #[case(true, vec!["read1", "read2"], "read3", true)] // invert_match: not found becomes true
953    fn test_query_name_matcher_read_match(
954        #[case] invert_match: bool,
955        #[case] query_names: Vec<&str>,
956        #[case] read_id: &str,
957        #[case] expected: bool,
958    ) {
959        let opts = MatcherOpts {
960            invert_match,
961            reverse_complement: false,
962        };
963        let names: AHashSet<Vec<u8>> = query_names.iter().map(|s| s.as_bytes().to_vec()).collect();
964        let matcher = QueryNameMatcher::new(names, opts);
965        let seq = "ACGT";
966        let qual = "XXXX";
967        let record = format!("@{read_id} description\n{seq}\n+\n{qual}\n");
968        let mut reader = seq_io::fastq::Reader::new(record.as_bytes());
969        let read_record = reader.next().unwrap().unwrap();
970        assert_eq!(matcher.read_match(&read_record), expected);
971    }
972
973    #[test]
974    fn test_query_name_matcher_with_header_description() {
975        // Tests that only the read ID (before first whitespace) is matched, not the full header
976        let opts = MatcherOpts {
977            invert_match: false,
978            reverse_complement: false,
979        };
980        let names: AHashSet<Vec<u8>> = vec!["SRR001666.1".as_bytes().to_vec()]
981            .into_iter()
982            .collect();
983        let matcher = QueryNameMatcher::new(names, opts);
984
985        // Record with description after read ID
986        let record = "@SRR001666.1 071112_SLXA description\nACGT\n+\nXXXX\n";
987        let mut reader = seq_io::fastq::Reader::new(record.as_bytes());
988        let read_record = reader.next().unwrap().unwrap();
989        assert!(matcher.read_match(&read_record));
990    }
991
992    // ############################################################################################
993    // Tests validate_fixed_pattern()
994    // ############################################################################################
995
996    #[rstest]
997    #[case("AGTGTGATG", false)]
998    #[case("QRNQRNQRN", true)]
999    #[case("ARNDCEQGHILKMFPSTWYV", true)] // all standard 20
1000    #[case("BJOUXZ", true)] // extended IUPAC: B, J, O, U, X, Z
1001    fn test_validate_fixed_pattern_is_ok(#[case] pattern: &str, #[case] protein: bool) {
1002        let result = validate_fixed_pattern(pattern, protein);
1003        assert!(result.is_ok());
1004    }
1005
1006    #[rstest]
1007    #[case(
1008        "AXGTGTGATG",
1009        false,
1010        "Fixed pattern must contain only DNA bases: A .. [X] .. GTGTGATG"
1011    )]
1012    #[case(
1013        "QRN1QRN",
1014        true,
1015        "Fixed pattern must contain only amino acids: QRN .. [1] .. QRN"
1016    )]
1017    fn test_validate_fixed_pattern_error(
1018        #[case] pattern: &str,
1019        #[case] protein: bool,
1020        #[case] msg: &str,
1021    ) {
1022        let result = validate_fixed_pattern(pattern, protein);
1023        let inner = result.unwrap_err().to_string();
1024        assert_eq!(inner, msg);
1025    }
1026
1027    #[test]
1028    fn test_validate_fixed_pattern_rejects_non_ascii() {
1029        // A non-ASCII character whose low byte happens to match 'A' (0x41)
1030        // U+0141 (Latin capital letter L with stroke) has value 321, low byte = 0x41 = 'A'
1031        let pattern = "AC\u{0141}T";
1032        let result = validate_fixed_pattern(pattern, false);
1033        assert!(result.is_err());
1034        assert!(
1035            result
1036                .unwrap_err()
1037                .to_string()
1038                .contains("Fixed pattern must contain only DNA bases")
1039        );
1040    }
1041
1042    // ############################################################################################
1043    // Tests BitMaskMatcher: has_iupac detection
1044    // ############################################################################################
1045
1046    #[test]
1047    fn test_bitmask_matcher_dna_only_has_no_iupac() {
1048        let opts = MatcherOpts {
1049            invert_match: false,
1050            reverse_complement: false,
1051        };
1052        let bitenc = encode(b"ACGT");
1053        let matcher = BitMaskMatcher::new(bitenc, opts);
1054        assert!(
1055            !matcher.has_iupac,
1056            "DNA-only pattern should have has_iupac=false"
1057        );
1058    }
1059
1060    #[test]
1061    fn test_bitmask_matcher_iupac_pattern_has_iupac() {
1062        let opts = MatcherOpts {
1063            invert_match: false,
1064            reverse_complement: false,
1065        };
1066        let bitenc = encode(b"ACRT");
1067        let matcher = BitMaskMatcher::new(bitenc, opts);
1068        assert!(
1069            matcher.has_iupac,
1070            "IUPAC pattern should have has_iupac=true"
1071        );
1072    }
1073
1074    // ############################################################################################
1075    // Tests BitMaskMatcher: edge cases
1076    // ############################################################################################
1077
1078    #[test]
1079    fn test_bitmask_matcher_read_shorter_than_pattern() {
1080        let opts = MatcherOpts {
1081            invert_match: false,
1082            reverse_complement: false,
1083        };
1084        let bitenc = encode(b"ACGTACGT");
1085        let matcher = BitMaskMatcher::new(bitenc, opts);
1086        let seq = "ACG";
1087        let qual = (0..seq.len()).map(|_| "X").collect::<String>();
1088        let record = format!("@id\n{seq}\n+\n{qual}\n");
1089        let mut reader = seq_io::fastq::Reader::new(record.as_bytes());
1090        let read_record = reader.next().unwrap().unwrap();
1091        assert!(!matcher.read_match(&read_record));
1092    }
1093
1094    #[test]
1095    fn test_bitmask_set_matcher_read_shorter_than_pattern() {
1096        let opts = MatcherOpts {
1097            invert_match: false,
1098            reverse_complement: false,
1099        };
1100        let bitencs = vec![encode(b"ACGTACGT")];
1101        let matcher = BitMaskSetMatcher::new(bitencs, opts);
1102        let seq = "ACG";
1103        let qual = (0..seq.len()).map(|_| "X").collect::<String>();
1104        let record = format!("@id\n{seq}\n+\n{qual}\n");
1105        let mut reader = seq_io::fastq::Reader::new(record.as_bytes());
1106        let read_record = reader.next().unwrap().unwrap();
1107        assert!(!matcher.read_match(&read_record));
1108    }
1109
1110    // ############################################################################################
1111    // Test BitMaskMatcher::read_match() with DNA-only patterns
1112    // ############################################################################################
1113
1114    #[rstest]
1115    #[case(false, "AG", "AGG", true)]
1116    #[case(false, "CC", "AGG", false)]
1117    #[case(true, "CC", "AGG", true)]
1118    #[case(true, "TT", "AGG", false)]
1119    #[case(false, "AT", "ATGAT", true)]
1120    #[case(true, "CG", "GCCG", true)]
1121    #[case(false, "AGAG", "AGAGAGAG", true)]
1122    #[case(true, "TCTC", "AGAGAGAG", true)]
1123    fn test_bitmask_matcher_dna_read_match(
1124        #[case] reverse_complement: bool,
1125        #[case] pattern: &str,
1126        #[case] seq: &str,
1127        #[case] expected: bool,
1128    ) {
1129        let invert_matches = [true, false];
1130        for invert_match in IntoIterator::into_iter(invert_matches) {
1131            let opts = MatcherOpts {
1132                invert_match,
1133                reverse_complement,
1134            };
1135            let matcher = BitMaskMatcher::new(encode(pattern.as_bytes()), opts);
1136            let qual = (0..seq.len()).map(|_| "X").collect::<String>();
1137            let record = format!("@id\n{seq}\n+\n{qual}\n");
1138            let mut reader = seq_io::fastq::Reader::new(record.as_bytes());
1139            let read_record = reader.next().unwrap().unwrap();
1140            let result = matcher.read_match(&read_record);
1141            if invert_match {
1142                assert_ne!(result, expected);
1143            } else {
1144                assert_eq!(result, expected);
1145            }
1146        }
1147    }
1148
1149    // ############################################################################################
1150    // Test BitMaskMatcher::read_match() with IUPAC patterns
1151    // ############################################################################################
1152
1153    #[rstest]
1154    #[case(false, "GATK", "GATG", true)]
1155    #[case(false, "GATK", "GATT", true)]
1156    #[case(false, "GATK", "GATA", false)]
1157    #[case(false, "GATK", "GATC", false)]
1158    #[case(false, "GATR", "GATGA", true)]
1159    #[case(false, "GATR", "GATA", true)]
1160    #[case(false, "GATR", "GATGT", true)]
1161    #[case(false, "N", "A", true)]
1162    #[case(false, "N", "C", true)]
1163    #[case(false, "N", "G", true)]
1164    #[case(false, "N", "T", true)]
1165    #[case(false, "ANA", "ACA", true)]
1166    #[case(false, "ANA", "AAA", true)]
1167    #[case(true, "GATK", "CATC", true)]
1168    fn test_bitmask_matcher_iupac_read_match(
1169        #[case] reverse_complement: bool,
1170        #[case] pattern: &str,
1171        #[case] seq: &str,
1172        #[case] expected: bool,
1173    ) {
1174        let opts = MatcherOpts {
1175            invert_match: false,
1176            reverse_complement,
1177        };
1178        let matcher = BitMaskMatcher::new(encode(pattern.as_bytes()), opts);
1179        let qual = (0..seq.len()).map(|_| "X").collect::<String>();
1180        let record = format!("@id\n{seq}\n+\n{qual}\n");
1181        let mut reader = seq_io::fastq::Reader::new(record.as_bytes());
1182        let read_record = reader.next().unwrap().unwrap();
1183        assert_eq!(
1184            matcher.read_match(&read_record),
1185            expected,
1186            "pattern={} seq={} revcomp={}",
1187            pattern,
1188            seq,
1189            reverse_complement
1190        );
1191    }
1192
1193    // ############################################################################################
1194    // Tests BitMaskSetMatcher::read_match() with DNA-only patterns
1195    // ############################################################################################
1196
1197    #[rstest]
1198    #[case(false, vec!["A", "AGG", "G"], "AGGG", true)]
1199    #[case(true, vec!["A", "AGG", "G"], "TCCC", true)]
1200    #[case(false, vec!["A", "AGG", "G"], "TTTT", false)]
1201    #[case(true, vec!["T", "AAA"], "CCCCC", false)]
1202    #[case(false, vec!["AGG", "C", "TT"], "AGGTT", true)]
1203    #[case(true, vec!["AGG", "C", "TT"], "GGGGG", true)]
1204    #[case(false, vec!["AC", "TT"], "TTACGTT", true)]
1205    #[case(true, vec!["GT", "AA"], "TTACGTT", true)]
1206    #[case(false, vec!["GAGA","AGTT"], "GAGAGTT", true)]
1207    #[case(true, vec!["CTCT","AACT"], "GAGAGTT", true)]
1208    fn test_bitmask_set_matcher_dna_read_match(
1209        #[case] reverse_complement: bool,
1210        #[case] patterns: Vec<&str>,
1211        #[case] seq: &str,
1212        #[case] expected: bool,
1213    ) {
1214        let invert_matches = [true, false];
1215        for invert_match in IntoIterator::into_iter(invert_matches) {
1216            let opts = MatcherOpts {
1217                invert_match,
1218                reverse_complement,
1219            };
1220            let bitencs: Vec<_> = patterns.iter().map(|p| encode(p.as_bytes())).collect();
1221            let matcher = BitMaskSetMatcher::new(bitencs, opts);
1222            let qual = (0..seq.len()).map(|_| "X").collect::<String>();
1223            let record = format!("@id\n{seq}\n+\n{qual}\n");
1224            let mut reader = seq_io::fastq::Reader::new(record.as_bytes());
1225            let read_record = reader.next().unwrap().unwrap();
1226            let result = matcher.read_match(&read_record);
1227            if invert_match {
1228                assert_ne!(result, expected);
1229            } else {
1230                assert_eq!(result, expected);
1231            }
1232        }
1233    }
1234
1235    // ############################################################################################
1236    // Tests BitMaskSetMatcher::read_match() with IUPAC patterns
1237    // ############################################################################################
1238
1239    #[rstest]
1240    #[case(false, vec!["GATK", "ANA"], "GATG", true)]
1241    #[case(false, vec!["GATK", "ANA"], "ACA", true)]
1242    #[case(false, vec!["GATK", "ANA"], "CCCC", false)]
1243    #[case(false, vec!["R", "Y"], "A", true)]
1244    #[case(false, vec!["R", "Y"], "C", true)]
1245    #[case(false, vec!["R", "Y"], "AAAA", true)]
1246    fn test_bitmask_set_matcher_iupac_read_match(
1247        #[case] reverse_complement: bool,
1248        #[case] patterns: Vec<&str>,
1249        #[case] seq: &str,
1250        #[case] expected: bool,
1251    ) {
1252        let opts = MatcherOpts {
1253            invert_match: false,
1254            reverse_complement,
1255        };
1256        let bitencs: Vec<_> = patterns.iter().map(|p| encode(p.as_bytes())).collect();
1257        let matcher = BitMaskSetMatcher::new(bitencs, opts);
1258        let qual = (0..seq.len()).map(|_| "X").collect::<String>();
1259        let record = format!("@id\n{seq}\n+\n{qual}\n");
1260        let mut reader = seq_io::fastq::Reader::new(record.as_bytes());
1261        let read_record = reader.next().unwrap().unwrap();
1262        assert_eq!(
1263            matcher.read_match(&read_record),
1264            expected,
1265            "patterns={:?} seq={} revcomp={}",
1266            patterns,
1267            seq,
1268            reverse_complement
1269        );
1270    }
1271
1272    // ############################################################################################
1273    // Tests for reverse complement matching via FixedStringMatcher
1274    // ############################################################################################
1275
1276    /// Helper: creates a RefRecord from a sequence string and calls `read_match` on the matcher.
1277    fn read_match_seq(matcher: &dyn Matcher, seq: &str) -> bool {
1278        let qual = "X".repeat(seq.len());
1279        let record = format!("@id\n{seq}\n+\n{qual}\n");
1280        let mut reader = seq_io::fastq::Reader::new(record.as_bytes());
1281        let rec = reader.next().unwrap().unwrap();
1282        matcher.read_match(&rec)
1283    }
1284
1285    #[test]
1286    fn test_rc_match_simple() {
1287        // Pattern "AAA" matches forward in AAATTTCCC
1288        let opts_fwd = MatcherOpts {
1289            invert_match: false,
1290            reverse_complement: false,
1291        };
1292        let opts_rc = MatcherOpts {
1293            invert_match: false,
1294            reverse_complement: true,
1295        };
1296        let matcher_fwd = FixedStringMatcher::new("AAA", opts_fwd);
1297        let matcher_rc = FixedStringMatcher::new("AAA", opts_rc);
1298
1299        assert!(read_match_seq(&matcher_fwd, "AAATTTCCC"));
1300        assert!(read_match_seq(&matcher_rc, "AAATTTCCC"));
1301    }
1302
1303    #[test]
1304    fn test_rc_match_only_in_rc() {
1305        // Pattern "TTT" does NOT match forward in CCCGGGAAA, but DOES match the RC (TTTCCCGGG)
1306        let opts_fwd = MatcherOpts {
1307            invert_match: false,
1308            reverse_complement: false,
1309        };
1310        let opts_rc = MatcherOpts {
1311            invert_match: false,
1312            reverse_complement: true,
1313        };
1314        let matcher_fwd = FixedStringMatcher::new("TTT", opts_fwd);
1315        let matcher_rc = FixedStringMatcher::new("TTT", opts_rc);
1316
1317        assert!(!read_match_seq(&matcher_fwd, "CCCGGGAAA"));
1318        assert!(read_match_seq(&matcher_rc, "CCCGGGAAA"));
1319    }
1320
1321    #[test]
1322    fn test_rc_match_at_end() {
1323        // Pattern "AAA" does NOT match forward in CCCGGGTTT, but DOES match the RC (AAACCCGGG)
1324        let opts_fwd = MatcherOpts {
1325            invert_match: false,
1326            reverse_complement: false,
1327        };
1328        let opts_rc = MatcherOpts {
1329            invert_match: false,
1330            reverse_complement: true,
1331        };
1332        let matcher_fwd = FixedStringMatcher::new("AAA", opts_fwd);
1333        let matcher_rc = FixedStringMatcher::new("AAA", opts_rc);
1334
1335        assert!(!read_match_seq(&matcher_fwd, "CCCGGGTTT"));
1336        assert!(read_match_seq(&matcher_rc, "CCCGGGTTT"));
1337    }
1338
1339    #[test]
1340    fn test_rc_match_palindrome() {
1341        // ACGTACGTACGT is its own RC — pattern "ACGT" matches both forward and RC
1342        let opts_fwd = MatcherOpts {
1343            invert_match: false,
1344            reverse_complement: false,
1345        };
1346        let opts_rc = MatcherOpts {
1347            invert_match: false,
1348            reverse_complement: true,
1349        };
1350        let matcher_fwd = FixedStringMatcher::new("ACGT", opts_fwd);
1351        let matcher_rc = FixedStringMatcher::new("ACGT", opts_rc);
1352
1353        assert!(read_match_seq(&matcher_fwd, "ACGTACGTACGT"));
1354        assert!(read_match_seq(&matcher_rc, "ACGTACGTACGT"));
1355    }
1356
1357    #[test]
1358    fn test_rc_match_single_base() {
1359        // Pattern "T" does NOT match forward in AAAA, but DOES match the RC (TTTT)
1360        let opts_fwd = MatcherOpts {
1361            invert_match: false,
1362            reverse_complement: false,
1363        };
1364        let opts_rc = MatcherOpts {
1365            invert_match: false,
1366            reverse_complement: true,
1367        };
1368        let matcher_fwd = FixedStringMatcher::new("T", opts_fwd);
1369        let matcher_rc = FixedStringMatcher::new("T", opts_rc);
1370
1371        assert!(!read_match_seq(&matcher_fwd, "AAAA"));
1372        assert!(read_match_seq(&matcher_rc, "AAAA"));
1373    }
1374
1375    // ############################################################################################
1376    // Property-based tests
1377    // ############################################################################################
1378
1379    use proptest::prelude::*;
1380
1381    /// Strategy for generating valid DNA sequences
1382    fn dna_sequence() -> impl Strategy<Value = Vec<u8>> {
1383        prop::collection::vec(prop::sample::select(b"ACGT".to_vec()), 1..100)
1384    }
1385
1386    /// Strategy for generating DNA patterns
1387    fn dna_pattern() -> impl Strategy<Value = Vec<u8>> {
1388        prop::collection::vec(prop::sample::select(b"ACGT".to_vec()), 1..20)
1389    }
1390
1391    proptest! {
1392        #![proptest_config(ProptestConfig::with_cases(20))]
1393
1394        /// Property: Reverse complement is involutive - rc(rc(seq)) == seq
1395        #[test]
1396        fn prop_reverse_complement_involutive(seq in dna_sequence()) {
1397            let rc = reverse_complement(&seq);
1398            let rc_rc = reverse_complement(&rc);
1399            prop_assert_eq!(seq, rc_rc);
1400        }
1401
1402        /// Property: A sequence always matches itself as a fixed string
1403        #[test]
1404        fn prop_sequence_matches_itself(seq in dna_sequence()) {
1405            let opts = MatcherOpts {
1406                invert_match: false,
1407                reverse_complement: false,
1408            };
1409            let matcher = FixedStringMatcher::new(
1410                std::str::from_utf8(&seq).unwrap(),
1411                opts
1412            );
1413            prop_assert!(matcher.bases_match(&seq));
1414        }
1415
1416        /// Property: RC-enabled matching is a superset of forward-only matching.
1417        #[test]
1418        fn prop_rc_matching_consistent(seq in dna_sequence(), pattern in dna_pattern()) {
1419            let opts_forward = MatcherOpts {
1420                invert_match: false,
1421                reverse_complement: false,
1422            };
1423            let opts_rc = MatcherOpts {
1424                invert_match: false,
1425                reverse_complement: true,
1426            };
1427
1428            let pattern_str = std::str::from_utf8(&pattern).unwrap();
1429            let seq_str = std::str::from_utf8(&seq).unwrap();
1430            let matcher_forward = FixedStringMatcher::new(pattern_str, opts_forward);
1431            let matcher_rc = FixedStringMatcher::new(pattern_str, opts_rc);
1432
1433            let matches_forward = read_match_seq(&matcher_forward, seq_str);
1434            let matches_with_rc = read_match_seq(&matcher_rc, seq_str);
1435
1436            // If forward matches, RC-enabled must also match.
1437            // The converse is not true (RC can match when forward doesn't).
1438            prop_assert!(
1439                !matches_forward || matches_with_rc,
1440                "forward matched but RC-enabled did not"
1441            );
1442        }
1443
1444        /// Property: Invert match negates the result
1445        #[test]
1446        fn prop_invert_match_negates(seq in dna_sequence(), pattern in dna_pattern()) {
1447            let opts_normal = MatcherOpts {
1448                invert_match: false,
1449                reverse_complement: false,
1450            };
1451            let opts_inverted = MatcherOpts {
1452                invert_match: true,
1453                reverse_complement: false,
1454            };
1455
1456            let pattern_str = std::str::from_utf8(&pattern).unwrap();
1457            let matcher_normal = FixedStringMatcher::new(pattern_str, opts_normal);
1458            let matcher_inverted = FixedStringMatcher::new(pattern_str, opts_inverted);
1459
1460            let matches_normal = matcher_normal.bases_match(&seq);
1461            let matches_inverted = matcher_inverted.bases_match(&seq);
1462
1463            prop_assert_ne!(matches_normal, matches_inverted);
1464        }
1465
1466        /// Property: Fixed string matcher and regex matcher agree on literal patterns
1467        #[test]
1468        fn prop_fixed_and_regex_agree(seq in dna_sequence(), pattern in dna_pattern()) {
1469            let opts = MatcherOpts {
1470                invert_match: false,
1471                reverse_complement: false,
1472            };
1473
1474            let pattern_str = std::str::from_utf8(&pattern).unwrap();
1475            let fixed_matcher = FixedStringMatcher::new(pattern_str, opts);
1476
1477            if let Ok(regex_matcher) = RegexMatcher::new(pattern_str, opts) {
1478                let fixed_result = fixed_matcher.bases_match(&seq);
1479                let regex_result = regex_matcher.bases_match(&seq);
1480                prop_assert_eq!(fixed_result, regex_result,
1481                    "Fixed and regex matchers disagree on pattern '{}' and sequence '{}'",
1482                    pattern_str, std::str::from_utf8(&seq).unwrap()
1483                );
1484            }
1485        }
1486    }
1487}