bio/alphabets/
dna.rs

1// Copyright 2014-2015 Johannes Köster, Peer Aramillo Irizar.
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//! Implementation of the DNA alphabet.
7//!
8//! # Example
9//!
10//! ```
11//! use bio::alphabets;
12//! let alphabet = alphabets::dna::alphabet();
13//! assert!(alphabet.is_word(b"GATTACA"));
14//! assert!(alphabet.is_word(b"gattaca"));
15//! assert!(!alphabet.is_word(b"ACGU"));
16//! ```
17//!
18use crate::alphabets::Alphabet;
19use std::borrow::Borrow;
20use std::sync::LazyLock;
21
22/// The DNA alphabet (uppercase and lowercase).
23pub fn alphabet() -> Alphabet {
24    Alphabet::new(b"ACGTacgt")
25}
26
27/// The DNA alphabet including N (uppercase and lowercase).
28pub fn n_alphabet() -> Alphabet {
29    Alphabet::new(b"ACGTNacgtn")
30}
31
32/// The IUPAC DNA alphabet (uppercase and lowercase).
33pub fn iupac_alphabet() -> Alphabet {
34    Alphabet::new(b"ACGTRYSWKMBDHVNZacgtryswkmbdhvnz")
35}
36
/// Byte-indexed complement lookup table, built on first access.
///
/// Entry `i` holds the complement of byte `i`. The IUPAC DNA symbols
/// `AGCTYRWSKMDVHBN` are mapped to their complements in uppercase and
/// lowercase; every other byte maps to itself.
static COMPLEMENT: LazyLock<[u8; 256]> = LazyLock::new(|| {
    // Distance between an uppercase ASCII letter and its lowercase form.
    const CASE_OFFSET: u8 = 32;

    // Start from the identity mapping so unknown bytes pass through unchanged.
    let mut table: [u8; 256] = std::array::from_fn(|byte| byte as u8);

    let symbols = b"AGCTYRWSKMDVHBN";
    let partners = b"TCGARYWSMKHBDVN";
    for (&symbol, &partner) in symbols.iter().zip(partners.iter()) {
        let upper = usize::from(symbol);
        table[upper] = partner;
        // Mirror the mapping for the lowercase variant of the same symbol.
        table[upper + usize::from(CASE_OFFSET)] = partner + CASE_OFFSET;
    }
    table
});
51
52/// Return complement of given DNA alphabet character (IUPAC alphabet supported).
53///
54/// Casing of input character is preserved, e.g. `t` → `a`, but `T` → `A`.
55/// All `N`s remain as they are.
56///
57/// ```
58/// use bio::alphabets::dna;
59///
60/// assert_eq!(dna::complement(65), 84); // A → T
61/// assert_eq!(dna::complement(99), 103); // c → g
62/// assert_eq!(dna::complement(78), 78); // N → N
63/// assert_eq!(dna::complement(89), 82); // Y → R
64/// assert_eq!(dna::complement(115), 115); // s → s
65/// ```
66#[inline]
67pub fn complement(a: u8) -> u8 {
68    COMPLEMENT[a as usize]
69}
70
71/// Calculate reverse complement of given text (IUPAC alphabet supported).
72///
73/// Casing of characters is preserved, e.g. `b"NaCgT"` → `b"aCgTN"`.
74/// All `N`s remain as they are.
75///
76/// ```
77/// use bio::alphabets::dna;
78///
79/// assert_eq!(dna::revcomp(b"ACGTN"), b"NACGT");
80/// assert_eq!(dna::revcomp(b"GaTtaCA"), b"TGtaAtC");
81/// assert_eq!(dna::revcomp(b"AGCTYRWSKMDVHBN"), b"NVDBHKMSWYRAGCT");
82/// ```
83pub fn revcomp<C, T>(text: T) -> Vec<u8>
84where
85    C: Borrow<u8>,
86    T: IntoIterator<Item = C>,
87    T::IntoIter: DoubleEndedIterator,
88{
89    text.into_iter()
90        .rev()
91        .map(|a| complement(*a.borrow()))
92        .collect()
93}
94
#[cfg(test)]
mod tests {
    use super::alphabet;

    /// An uppercase sequence of plain nucleotides is accepted.
    #[test]
    fn is_word() {
        let dna = alphabet();
        assert!(dna.is_word(b"GATTACA"));
    }

    /// A sequence containing `U` is rejected by the plain DNA alphabet.
    #[test]
    fn is_no_word() {
        let dna = alphabet();
        assert!(!dna.is_word(b"gaUUaca"));
    }

    /// A punctuation symbol is rejected.
    #[test]
    fn symbol_is_no_word() {
        let dna = alphabet();
        assert!(!dna.is_word(b"#"));
    }

    /// ASCII digits are rejected.
    #[test]
    fn number_is_no_word() {
        let dna = alphabet();
        assert!(!dna.is_word(b"42"));
    }
}