Skip to main content

nanalogue_core/utils/
complement.rs

1//! DNA complement and reverse-complement helpers.
2//!
3//! This module includes code adapted from the `bio` crate's DNA alphabet
4//! utilities. See `THIRD_PARTY_NOTICES.md` for the upstream license text.
5
6use std::array;
7use std::borrow::Borrow;
8use std::sync::LazyLock;
9
10/// Complement lookup table adapted from the `bio` crate's DNA alphabet helpers.
11static COMPLEMENT: LazyLock<[u8; 256]> = LazyLock::new(|| {
12    let mut comp = array::from_fn(|v| u8::try_from(v).expect("array indices 0..=255 fit in u8"));
13    b"AGCTYRWSKMDVHBN"
14        .iter()
15        .zip(b"TCGARYWSMKHBDVN".iter())
16        .for_each(|(&a, &b)| {
17            *comp
18                .get_mut(a as usize)
19                .expect("ASCII nucleotide bytes are valid lookup indices") = b;
20            *comp
21                .get_mut(a as usize + 32)
22                .expect("ASCII lowercase nucleotide bytes are valid lookup indices") = b + 32;
23        });
24    comp
25});
26
27/// Return the DNA complement of a byte while preserving `bio`'s permissive behavior.
28///
29/// Unknown bytes are returned unchanged. IUPAC ambiguity codes and lowercase
30/// bases are supported.
31///
32/// # Panics
33///
34/// Panics only if a `u8` fails to index the 256-byte lookup table, which is
35/// impossible.
36#[inline]
37#[must_use]
38pub fn complement(a: u8) -> u8 {
39    COMPLEMENT
40        .get(a as usize)
41        .copied()
42        .expect("u8 values always index into the 256-byte complement table")
43}
44
45/// Return the reverse complement of an input byte sequence.
46///
47/// The fast path (`.collect()`) is only taken when the iterator's reported
48/// upper-bound length is within a safe pre-reservation budget. Larger
49/// reported lengths are refused and return an empty `Vec` instead.
50#[must_use]
51pub fn revcomp<C, T>(text: T) -> Vec<u8>
52where
53    C: Borrow<u8>,
54    T: IntoIterator<Item = C>,
55    T::IntoIter: DoubleEndedIterator,
56{
57    /// Upper bound on the size-hint we are willing to pre-allocate (3 GiB).
58    /// Anything reporting more is refused.
59    const MAX_PRE_RESERVE: usize = 3usize * 1024 * 1024 * 1024;
60
61    let iter = text.into_iter().rev().map(|a| complement(*a.borrow()));
62    let (lo, hi) = iter.size_hint();
63    let upper = hi.unwrap_or(lo);
64
65    if upper <= MAX_PRE_RESERVE {
66        iter.collect()
67    } else {
68        Vec::new()
69    }
70}
71
72#[cfg(test)]
73mod tests {
74    use super::*;
75
76    #[test]
77    fn complement_preserves_iupac_behavior() {
78        assert_eq!(complement(b'A'), b'T');
79        assert_eq!(complement(b'c'), b'g');
80        assert_eq!(complement(b'N'), b'N');
81        assert_eq!(complement(b'Y'), b'R');
82        assert_eq!(complement(b's'), b's');
83        assert_eq!(complement(b'='), b'=');
84        assert_eq!(complement(b'X'), b'X');
85    }
86
87    #[test]
88    fn complement_table_has_all_u8_entries() {
89        let table = &*COMPLEMENT;
90        assert_eq!(table.len(), usize::from(u8::MAX) + 1);
91
92        for byte in u8::MIN..=u8::MAX {
93            assert!(table.get(usize::from(byte)).is_some());
94        }
95    }
96
97    #[test]
98    fn revcomp_preserves_iupac_behavior() {
99        assert_eq!(revcomp(b"ACGTN"), b"NACGT");
100        assert_eq!(revcomp(b"GaTtaCA"), b"TGtaAtC");
101        assert_eq!(revcomp(b"AGCTYRWSKMDVHBN"), b"NVDBHKMSWYRAGCT");
102    }
103
104    /// Iterators with `size_hint` exceeding the pre-reserve budget return an
105    /// empty `Vec`.
106    #[test]
107    fn revcomp_does_not_abort_on_adversarial_size_hint() {
108        let it = std::iter::repeat_n(0u8, usize::MAX);
109        let out = revcomp(it);
110        assert!(
111            out.is_empty(),
112            "adversarial size_hint must produce an empty Vec, got {}",
113            out.len(),
114        );
115    }
116}