Skip to main content

fqgrep_lib/
mod.rs

1#![deny(unsafe_code)]
2#![allow(
3    clippy::must_use_candidate,
4    clippy::missing_panics_doc,
5    clippy::missing_errors_doc,
6    clippy::module_name_repetitions
7)]
8pub mod color;
9pub mod matcher;
10pub mod seq_io;
11use bio::data_structures::bitenc::BitEnc;
12use std::{borrow::Borrow, path::Path, sync::LazyLock};
13
14/// Lookup table mapping ASCII byte values to their 4-bit IUPAC bitmask encodings.
15/// A=1, C=2, G=4, T=8, and ambiguity codes are the bitwise OR of their constituents.
16pub const IUPAC_MASKS: [u8; 256] = {
17    let mut masks = [0u8; 256];
18    let (a, c, g, t) = (1, 2, 4, 8);
19    masks[b'A' as usize] = a;
20    masks[b'C' as usize] = c;
21    masks[b'G' as usize] = g;
22    masks[b'T' as usize] = t;
23    masks[b'U' as usize] = t;
24    masks[b'M' as usize] = a | c;
25    masks[b'R' as usize] = a | g;
26    masks[b'W' as usize] = a | t;
27    masks[b'S' as usize] = c | g;
28    masks[b'Y' as usize] = c | t;
29    masks[b'K' as usize] = g | t;
30    masks[b'V' as usize] = a | c | g;
31    masks[b'H' as usize] = a | c | t;
32    masks[b'D' as usize] = a | g | t;
33    masks[b'B' as usize] = c | g | t;
34    masks[b'N' as usize] = a | c | g | t;
35    // Lowercase mappings
36    masks[b'a' as usize] = a;
37    masks[b'c' as usize] = c;
38    masks[b'g' as usize] = g;
39    masks[b't' as usize] = t;
40    masks[b'u' as usize] = t;
41    masks[b'm' as usize] = a | c;
42    masks[b'r' as usize] = a | g;
43    masks[b'w' as usize] = a | t;
44    masks[b's' as usize] = c | g;
45    masks[b'y' as usize] = c | t;
46    masks[b'k' as usize] = g | t;
47    masks[b'v' as usize] = a | c | g;
48    masks[b'h' as usize] = a | c | t;
49    masks[b'd' as usize] = a | g | t;
50    masks[b'b' as usize] = c | g | t;
51    masks[b'n' as usize] = a | c | g | t;
52    masks
53};
54
55pub const DNA_BASES: [u8; 5] = *b"ACGTN";
56pub const AMINO_ACIDS: [u8; 26] = *b"ARNDCEQGHILKMFPSTWYVBJOUXZ";
57pub const IUPAC_BASES: [u8; 15] = *b"AGCTYRWSKMDVHBN";
58pub const IUPAC_BASES_COMPLEMENT: [u8; 15] = *b"TCGARYWSMKHBDVN";
59
60/// The encoded 4-bit mask values for the four standard DNA bases (A=1, C=2, G=4, T=8).
61/// Used to distinguish DNA-only patterns from those containing IUPAC ambiguity codes.
62pub const DNA_MASK_VALUES: [u8; 4] = [1, 2, 4, 8];
63
64/// The maximum number of expanded patterns allowed before returning an error.
65pub const MAX_IUPAC_EXPANSIONS: usize = 10_000;
66
67/// Expands the pattern containing IUPAC bases into one or more patterns.
68///
69/// Returns an error if the expansion would exceed [`MAX_IUPAC_EXPANSIONS`] patterns.
70pub fn expand_iupac_fixed_pattern(
71    pattern: &[u8],
72    pattern_index: usize,
73    prefix: &mut Vec<u8>,
74    expanded: &mut Vec<Vec<u8>>,
75) -> Result<(), String> {
76    if expanded.len() >= MAX_IUPAC_EXPANSIONS {
77        return Err(format!(
78            "IUPAC expansion exceeded {MAX_IUPAC_EXPANSIONS} patterns for input '{}'",
79            String::from_utf8_lossy(pattern)
80        ));
81    }
82    if pattern_index == pattern.len() {
83        expanded.push(prefix.clone());
84    } else {
85        let mask = IUPAC_MASKS[pattern[pattern_index] as usize];
86        for base in DNA_BASES.iter().take(4) {
87            if (IUPAC_MASKS[*base as usize] & mask) != 0 {
88                prefix.push(*base);
89                expand_iupac_fixed_pattern(pattern, pattern_index + 1, prefix, expanded)?;
90                prefix.pop();
91            }
92        }
93    }
94    Ok(())
95}
96
97/// Converts an IUPAC pattern into a regex pattern by replacing each ambiguity code with a
98/// character class of its constituent DNA bases (e.g., `R` becomes `[AG]`).
99pub fn expand_iupac_regex(pattern: &[u8]) -> Vec<u8> {
100    let mut new_pattern: Vec<u8> = Vec::new();
101    for base in pattern {
102        let mask = IUPAC_MASKS[*base as usize];
103        let mut bases = Vec::new();
104        for dna_base in DNA_BASES.iter().take(4) {
105            if (IUPAC_MASKS[*dna_base as usize] & mask) != 0 {
106                bases.push(*dna_base);
107            }
108        }
109        if bases.len() == 1 {
110            new_pattern.push(bases[0]);
111        } else {
112            new_pattern.push(b'[');
113            new_pattern.extend_from_slice(&bases);
114            new_pattern.push(b']');
115        }
116    }
117    new_pattern
118}
119
120/// Encodes a sequence of ASCII bases into a `BitEnc` using 4-bit IUPAC mask values.
121pub fn encode(bases: &[u8]) -> BitEnc {
122    let mut vec = BitEnc::with_capacity(4, bases.len());
123    for base in bases {
124        vec.push(IUPAC_MASKS[*base as usize]);
125    }
126    vec
127}
128
129static COMPLEMENT: LazyLock<[u8; 256]> = LazyLock::new(|| {
130    let mut comp = [0; 256];
131    for (v, a) in comp.iter_mut().enumerate() {
132        *a = v as u8;
133    }
134    for (&a, &b) in IUPAC_BASES.iter().zip(IUPAC_BASES_COMPLEMENT.iter()) {
135        comp[a as usize] = b;
136        comp[a as usize + 32] = b + 32; // lowercase variants
137    }
138    comp
139});
140
141fn complement(a: u8) -> u8 {
142    COMPLEMENT[a as usize]
143}
144
145fn reverse_complement<C, T>(text: T) -> Vec<u8>
146where
147    C: Borrow<u8>,
148    T: IntoIterator<Item = C>,
149    T::IntoIter: DoubleEndedIterator,
150{
151    text.into_iter()
152        .rev()
153        .map(|a| complement(*a.borrow()))
154        .collect()
155}
156
157/// Returns true if the path ends with a recognized GZIP file extension
158fn is_path_with_extension<P: AsRef<Path>>(p: &P, extensions: [&str; 2]) -> bool {
159    if let Some(ext) = p.as_ref().extension() {
160        match ext.to_str() {
161            Some(x) => extensions.contains(&x),
162            None => false,
163        }
164    } else {
165        false
166    }
167}
168
169/// The set of file extensions to treat as GZIPPED
170const GZIP_EXTENSIONS: [&str; 2] = ["gz", "bgz"];
171
172/// Returns true if the path ends with a recognized GZIP file extension
173pub fn is_gzip_path<P: AsRef<Path>>(p: &P) -> bool {
174    is_path_with_extension(p, GZIP_EXTENSIONS)
175}
176
177/// The set of file extensions to treat as FASTQ
178const FASTQ_EXTENSIONS: [&str; 2] = ["fastq", "fq"];
179
180/// Returns true if the path ends with a recognized FASTQ file extension
181pub fn is_fastq_path<P: AsRef<Path>>(p: &P) -> bool {
182    is_path_with_extension(p, FASTQ_EXTENSIONS)
183}
184
185// Tests
186#[cfg(test)]
187pub mod tests {
188    use crate::*;
189    use rstest::rstest;
190    use std::str;
191    use tempfile::TempDir;
192
193    // ############################################################################################
194    // Tests reverse_complement()
195    // ############################################################################################
196
197    #[rstest]
198    #[case("ACGT", "ACGT")] // Reverse complement with even length string
199    #[case("ACG", "CGT")] // Reverse complement with odd length string (tests for off by one error)
200    fn test_reverse_complement(#[case] seq: &str, #[case] expected: &str) {
201        let result = reverse_complement(seq.as_bytes());
202        let string_result = str::from_utf8(&result).unwrap();
203        assert_eq!(&string_result, &expected);
204    }
205
206    // ############################################################################################
207    // Tests is_gzip_path()
208    // ############################################################################################
209
210    #[rstest]
211    #[case("test_fastq.fq.gz", true)] // .fq.gz is valid gzip
212    #[case("test_fastq.fq.bgz", true)] // .fq.bgz is valid gzip
213    #[case("test_fastq.fq.tar", false)] // .fq.tar is invalid gzip
214    fn test_is_gzip_path(#[case] file_name: &str, #[case] expected: bool) {
215        let dir = TempDir::new().unwrap();
216        let file_path = dir.path().join(file_name);
217        let result = is_gzip_path(&file_path);
218        assert_eq!(result, expected);
219    }
220    // ############################################################################################
221    // Tests is_fastq_path()
222    // ############################################################################################
223
224    #[rstest]
225    #[case("test_fastq.fq", true)] // .fq is valid fastq
226    #[case("test_fastq.fastq", true)] // .fastq is valid fastq
227    #[case("test_fastq.sam", false)] // .sam is invalid fastq
228    fn test_is_fastq_path(#[case] file_name: &str, #[case] expected: bool) {
229        let dir = TempDir::new().unwrap();
230        let file_path = dir.path().join(file_name);
231        let result = is_fastq_path(&file_path);
232        assert_eq!(result, expected);
233    }
234
235    // ############################################################################################
236    // Tests expand_iupac_fixed_pattern()
237    // ############################################################################################
238
239    #[rstest]
240    #[case("GATTACA", vec!["GATTACA"])]
241    #[case("GATMACA", vec!["GATAACA", "GATCACA"])]
242    #[case("GRTBANA", vec!["GATCAAA", "GATCACA", "GATCAGA", "GATCATA", "GATGAAA", "GATGACA", "GATGAGA", "GATGATA", "GATTAAA", "GATTACA", "GATTAGA", "GATTATA", "GGTCAAA", "GGTCACA", "GGTCAGA", "GGTCATA", "GGTGAAA", "GGTGACA", "GGTGAGA", "GGTGATA", "GGTTAAA", "GGTTACA", "GGTTAGA", "GGTTATA"])]
243    fn test_expand_iupac_fixed_pattern(#[case] pattern: &str, #[case] expected: Vec<&str>) {
244        let mut actual = Vec::new();
245        expand_iupac_fixed_pattern(pattern.as_bytes(), 0, &mut Vec::new(), &mut actual).unwrap();
246        let actual_strings: Vec<String> = actual
247            .iter()
248            .map(|bytes| str::from_utf8(bytes).unwrap().to_string())
249            .collect();
250        assert_eq!(actual_strings, expected);
251    }
252
253    // ############################################################################################
254    // Tests expand_iupac_regex()
255    // ############################################################################################
256
257    #[rstest]
258    #[case("GATTACA", "GATTACA")]
259    #[case("GATMACA", "GAT[AC]ACA")]
260    #[case("GRTBANA", "G[AG]T[CGT]A[ACGT]A")]
261    #[case("GATUCA", "GATTCA")]
262    fn test_expand_iupac_regex(#[case] pattern: &str, #[case] expected: &str) {
263        let expanded = expand_iupac_regex(pattern.as_bytes());
264        let actual = str::from_utf8(&expanded).unwrap();
265        assert_eq!(actual, expected);
266    }
267
268    // ############################################################################################
269    // Tests encode()
270    // ############################################################################################
271
272    #[rstest]
273    fn test_encode() {
274        for base in IUPAC_BASES {
275            let actual: u8 = encode(&[base]).get(0).unwrap();
276            assert_eq!(actual, IUPAC_MASKS[base as usize]);
277        }
278    }
279
280    // ############################################################################################
281    // Tests lowercase IUPAC masks
282    // ############################################################################################
283
284    #[rstest]
285    fn test_iupac_masks_lowercase() {
286        for &base in b"ACGTUMRWSYKVHDBNacgtumrwsykvhdbn" {
287            let upper = (base as char).to_ascii_uppercase() as u8;
288            assert_eq!(
289                IUPAC_MASKS[base as usize], IUPAC_MASKS[upper as usize],
290                "Lowercase '{}' should have the same mask as uppercase '{}'",
291                base as char, upper as char
292            );
293        }
294    }
295
296    #[rstest]
297    fn test_encode_lowercase() {
298        for &base in b"acgt" {
299            let upper = (base as char).to_ascii_uppercase() as u8;
300            let lower_enc: u8 = encode(&[base]).get(0).unwrap();
301            let upper_enc: u8 = encode(&[upper]).get(0).unwrap();
302            assert_eq!(lower_enc, upper_enc);
303        }
304    }
305
306    // ############################################################################################
307    // Tests expand_iupac_fixed_pattern expansion limit
308    // ############################################################################################
309
310    #[rstest]
311    fn test_expand_iupac_fixed_pattern_exceeds_limit() {
312        // 'N' expands to 4 bases; N repeated 8 times = 4^8 = 65536 > MAX_IUPAC_EXPANSIONS
313        let pattern = b"NNNNNNNN";
314        let mut expanded = Vec::new();
315        let result = expand_iupac_fixed_pattern(pattern, 0, &mut Vec::new(), &mut expanded);
316        assert!(
317            result.is_err(),
318            "Should error when expansion exceeds the limit"
319        );
320        assert!(
321            result
322                .unwrap_err()
323                .contains(&MAX_IUPAC_EXPANSIONS.to_string()),
324            "Error message should mention the limit"
325        );
326    }
327}