1#![deny(unsafe_code)]
2#![allow(
3 clippy::must_use_candidate,
4 clippy::missing_panics_doc,
5 clippy::missing_errors_doc,
6 clippy::module_name_repetitions
7)]
8pub mod color;
9pub mod matcher;
10pub mod seq_io;
11use bio::data_structures::bitenc::BitEnc;
12use std::{borrow::Borrow, path::Path, sync::LazyLock};
13
14pub const IUPAC_MASKS: [u8; 256] = {
17 let mut masks = [0u8; 256];
18 let (a, c, g, t) = (1, 2, 4, 8);
19 masks[b'A' as usize] = a;
20 masks[b'C' as usize] = c;
21 masks[b'G' as usize] = g;
22 masks[b'T' as usize] = t;
23 masks[b'U' as usize] = t;
24 masks[b'M' as usize] = a | c;
25 masks[b'R' as usize] = a | g;
26 masks[b'W' as usize] = a | t;
27 masks[b'S' as usize] = c | g;
28 masks[b'Y' as usize] = c | t;
29 masks[b'K' as usize] = g | t;
30 masks[b'V' as usize] = a | c | g;
31 masks[b'H' as usize] = a | c | t;
32 masks[b'D' as usize] = a | g | t;
33 masks[b'B' as usize] = c | g | t;
34 masks[b'N' as usize] = a | c | g | t;
35 masks[b'a' as usize] = a;
37 masks[b'c' as usize] = c;
38 masks[b'g' as usize] = g;
39 masks[b't' as usize] = t;
40 masks[b'u' as usize] = t;
41 masks[b'm' as usize] = a | c;
42 masks[b'r' as usize] = a | g;
43 masks[b'w' as usize] = a | t;
44 masks[b's' as usize] = c | g;
45 masks[b'y' as usize] = c | t;
46 masks[b'k' as usize] = g | t;
47 masks[b'v' as usize] = a | c | g;
48 masks[b'h' as usize] = a | c | t;
49 masks[b'd' as usize] = a | g | t;
50 masks[b'b' as usize] = c | g | t;
51 masks[b'n' as usize] = a | c | g | t;
52 masks
53};
54
55pub const DNA_BASES: [u8; 5] = *b"ACGTN";
56pub const AMINO_ACIDS: [u8; 26] = *b"ARNDCEQGHILKMFPSTWYVBJOUXZ";
57pub const IUPAC_BASES: [u8; 15] = *b"AGCTYRWSKMDVHBN";
58pub const IUPAC_BASES_COMPLEMENT: [u8; 15] = *b"TCGARYWSMKHBDVN";
59
60pub const DNA_MASK_VALUES: [u8; 4] = [1, 2, 4, 8];
63
64pub const MAX_IUPAC_EXPANSIONS: usize = 10_000;
66
67pub fn expand_iupac_fixed_pattern(
71 pattern: &[u8],
72 pattern_index: usize,
73 prefix: &mut Vec<u8>,
74 expanded: &mut Vec<Vec<u8>>,
75) -> Result<(), String> {
76 if expanded.len() >= MAX_IUPAC_EXPANSIONS {
77 return Err(format!(
78 "IUPAC expansion exceeded {MAX_IUPAC_EXPANSIONS} patterns for input '{}'",
79 String::from_utf8_lossy(pattern)
80 ));
81 }
82 if pattern_index == pattern.len() {
83 expanded.push(prefix.clone());
84 } else {
85 let mask = IUPAC_MASKS[pattern[pattern_index] as usize];
86 for base in DNA_BASES.iter().take(4) {
87 if (IUPAC_MASKS[*base as usize] & mask) != 0 {
88 prefix.push(*base);
89 expand_iupac_fixed_pattern(pattern, pattern_index + 1, prefix, expanded)?;
90 prefix.pop();
91 }
92 }
93 }
94 Ok(())
95}
96
97pub fn expand_iupac_regex(pattern: &[u8]) -> Vec<u8> {
100 let mut new_pattern: Vec<u8> = Vec::new();
101 for base in pattern {
102 let mask = IUPAC_MASKS[*base as usize];
103 let mut bases = Vec::new();
104 for dna_base in DNA_BASES.iter().take(4) {
105 if (IUPAC_MASKS[*dna_base as usize] & mask) != 0 {
106 bases.push(*dna_base);
107 }
108 }
109 if bases.len() == 1 {
110 new_pattern.push(bases[0]);
111 } else {
112 new_pattern.push(b'[');
113 new_pattern.extend_from_slice(&bases);
114 new_pattern.push(b']');
115 }
116 }
117 new_pattern
118}
119
120pub fn encode(bases: &[u8]) -> BitEnc {
122 let mut vec = BitEnc::with_capacity(4, bases.len());
123 for base in bases {
124 vec.push(IUPAC_MASKS[*base as usize]);
125 }
126 vec
127}
128
129static COMPLEMENT: LazyLock<[u8; 256]> = LazyLock::new(|| {
130 let mut comp = [0; 256];
131 for (v, a) in comp.iter_mut().enumerate() {
132 *a = v as u8;
133 }
134 for (&a, &b) in IUPAC_BASES.iter().zip(IUPAC_BASES_COMPLEMENT.iter()) {
135 comp[a as usize] = b;
136 comp[a as usize + 32] = b + 32; }
138 comp
139});
140
141fn complement(a: u8) -> u8 {
142 COMPLEMENT[a as usize]
143}
144
145fn reverse_complement<C, T>(text: T) -> Vec<u8>
146where
147 C: Borrow<u8>,
148 T: IntoIterator<Item = C>,
149 T::IntoIter: DoubleEndedIterator,
150{
151 text.into_iter()
152 .rev()
153 .map(|a| complement(*a.borrow()))
154 .collect()
155}
156
157fn is_path_with_extension<P: AsRef<Path>>(p: &P, extensions: [&str; 2]) -> bool {
159 if let Some(ext) = p.as_ref().extension() {
160 match ext.to_str() {
161 Some(x) => extensions.contains(&x),
162 None => false,
163 }
164 } else {
165 false
166 }
167}
168
169const GZIP_EXTENSIONS: [&str; 2] = ["gz", "bgz"];
171
172pub fn is_gzip_path<P: AsRef<Path>>(p: &P) -> bool {
174 is_path_with_extension(p, GZIP_EXTENSIONS)
175}
176
177const FASTQ_EXTENSIONS: [&str; 2] = ["fastq", "fq"];
179
180pub fn is_fastq_path<P: AsRef<Path>>(p: &P) -> bool {
182 is_path_with_extension(p, FASTQ_EXTENSIONS)
183}
184
185#[cfg(test)]
187pub mod tests {
188 use crate::*;
189 use rstest::rstest;
190 use std::str;
191 use tempfile::TempDir;
192
193 #[rstest]
198 #[case("ACGT", "ACGT")] #[case("ACG", "CGT")] fn test_reverse_complement(#[case] seq: &str, #[case] expected: &str) {
201 let result = reverse_complement(seq.as_bytes());
202 let string_result = str::from_utf8(&result).unwrap();
203 assert_eq!(&string_result, &expected);
204 }
205
206 #[rstest]
211 #[case("test_fastq.fq.gz", true)] #[case("test_fastq.fq.bgz", true)] #[case("test_fastq.fq.tar", false)] fn test_is_gzip_path(#[case] file_name: &str, #[case] expected: bool) {
215 let dir = TempDir::new().unwrap();
216 let file_path = dir.path().join(file_name);
217 let result = is_gzip_path(&file_path);
218 assert_eq!(result, expected);
219 }
220 #[rstest]
225 #[case("test_fastq.fq", true)] #[case("test_fastq.fastq", true)] #[case("test_fastq.sam", false)] fn test_is_fastq_path(#[case] file_name: &str, #[case] expected: bool) {
229 let dir = TempDir::new().unwrap();
230 let file_path = dir.path().join(file_name);
231 let result = is_fastq_path(&file_path);
232 assert_eq!(result, expected);
233 }
234
235 #[rstest]
240 #[case("GATTACA", vec!["GATTACA"])]
241 #[case("GATMACA", vec!["GATAACA", "GATCACA"])]
242 #[case("GRTBANA", vec!["GATCAAA", "GATCACA", "GATCAGA", "GATCATA", "GATGAAA", "GATGACA", "GATGAGA", "GATGATA", "GATTAAA", "GATTACA", "GATTAGA", "GATTATA", "GGTCAAA", "GGTCACA", "GGTCAGA", "GGTCATA", "GGTGAAA", "GGTGACA", "GGTGAGA", "GGTGATA", "GGTTAAA", "GGTTACA", "GGTTAGA", "GGTTATA"])]
243 fn test_expand_iupac_fixed_pattern(#[case] pattern: &str, #[case] expected: Vec<&str>) {
244 let mut actual = Vec::new();
245 expand_iupac_fixed_pattern(pattern.as_bytes(), 0, &mut Vec::new(), &mut actual).unwrap();
246 let actual_strings: Vec<String> = actual
247 .iter()
248 .map(|bytes| str::from_utf8(bytes).unwrap().to_string())
249 .collect();
250 assert_eq!(actual_strings, expected);
251 }
252
253 #[rstest]
258 #[case("GATTACA", "GATTACA")]
259 #[case("GATMACA", "GAT[AC]ACA")]
260 #[case("GRTBANA", "G[AG]T[CGT]A[ACGT]A")]
261 #[case("GATUCA", "GATTCA")]
262 fn test_expand_iupac_regex(#[case] pattern: &str, #[case] expected: &str) {
263 let expanded = expand_iupac_regex(pattern.as_bytes());
264 let actual = str::from_utf8(&expanded).unwrap();
265 assert_eq!(actual, expected);
266 }
267
268 #[rstest]
273 fn test_encode() {
274 for base in IUPAC_BASES {
275 let actual: u8 = encode(&[base]).get(0).unwrap();
276 assert_eq!(actual, IUPAC_MASKS[base as usize]);
277 }
278 }
279
280 #[rstest]
285 fn test_iupac_masks_lowercase() {
286 for &base in b"ACGTUMRWSYKVHDBNacgtumrwsykvhdbn" {
287 let upper = (base as char).to_ascii_uppercase() as u8;
288 assert_eq!(
289 IUPAC_MASKS[base as usize], IUPAC_MASKS[upper as usize],
290 "Lowercase '{}' should have the same mask as uppercase '{}'",
291 base as char, upper as char
292 );
293 }
294 }
295
296 #[rstest]
297 fn test_encode_lowercase() {
298 for &base in b"acgt" {
299 let upper = (base as char).to_ascii_uppercase() as u8;
300 let lower_enc: u8 = encode(&[base]).get(0).unwrap();
301 let upper_enc: u8 = encode(&[upper]).get(0).unwrap();
302 assert_eq!(lower_enc, upper_enc);
303 }
304 }
305
306 #[rstest]
311 fn test_expand_iupac_fixed_pattern_exceeds_limit() {
312 let pattern = b"NNNNNNNN";
314 let mut expanded = Vec::new();
315 let result = expand_iupac_fixed_pattern(pattern, 0, &mut Vec::new(), &mut expanded);
316 assert!(
317 result.is_err(),
318 "Should error when expansion exceeds the limit"
319 );
320 assert!(
321 result
322 .unwrap_err()
323 .contains(&MAX_IUPAC_EXPANSIONS.to_string()),
324 "Error message should mention the limit"
325 );
326 }
327}