bio_seq/codec/
iupac.rs

1//! 4-bit IUPAC nucleotide ambiguity codes
2//!
3//! IUPAC nucleotide ambiguity codes are represented with 4 bits
4//!
5//! |   | A | C | G | T |
6//! | - | - | - | - | - |
7//! | A | 1 | 0 | 0 | 0 |
8//! | C | 0 | 1 | 0 | 0 |
9//! | G | 0 | 0 | 1 | 0 |
10//! | T | 0 | 0 | 0 | 1 |
11//! | Y | 0 | 1 | 0 | 1 |
12//! | R | 1 | 0 | 1 | 0 |
13//! | W | 1 | 0 | 0 | 1 |
14//! | S | 0 | 1 | 1 | 0 |
15//! | K | 0 | 0 | 1 | 1 |
16//! | M | 1 | 1 | 0 | 0 |
17//! | D | 1 | 0 | 1 | 1 |
18//! | V | 1 | 1 | 1 | 0 |
19//! | H | 1 | 1 | 0 | 1 |
20//! | B | 0 | 1 | 1 | 1 |
21//! | N | 1 | 1 | 1 | 1 |
22//! | X/- | 0 | 0 | 0 | 0 |
23//!
24//! This means that we can treat each symbol as a set and we get meaningful bitwise operations:
25//!
26//! ```rust
27//! use bio_seq::prelude::*;
28//!
29//! // Set union:
30//! let union = iupac!("AS-GYTNAN") | iupac!("ANTGCAT-N");
31//! assert_eq!(union, iupac!("ANTGYWNAN"));
32//!
33//! // Set intersection:
34//! let intersection = iupac!("ACGTSWKMN") & iupac!("WKMSTNNAN");
35//! assert_eq!(intersection, iupac!("A----WKAN"));
36//! ```
37//!
38//! Which can be used to implement pattern matching:
39//!
40//! ```rust
41//! use bio_seq::prelude::*;
42//!
43//! let seq = iupac!("AGCTNNCAGTCGACGTATGTA");
44//! let pattern = iupac!("AYG");
45//!
46//! for slice in seq.windows(pattern.len()) {
47//!    if pattern.contains(slice) {
48//!        println!("{slice} matches pattern");
49//!    }
50//! }
51//!
52//! // ACG matches pattern
53//! // ATG matches pattern
54//! ```
55use crate::codec::{Codec, dna::Dna};
56use crate::seq::{Seq, SeqArray, SeqSlice};
57use crate::{Complement, ComplementMut};
58
59const IUPAC_COMPLEMENT_TABLE: [u8; 16] = {
60    let mut table = [0; 16];
61
62    table[Iupac::A as usize] = Iupac::T as u8;
63    table[Iupac::C as usize] = Iupac::G as u8;
64    table[Iupac::G as usize] = Iupac::C as u8;
65    table[Iupac::T as usize] = Iupac::A as u8;
66    table[Iupac::Y as usize] = Iupac::R as u8;
67    table[Iupac::R as usize] = Iupac::Y as u8;
68    table[Iupac::W as usize] = Iupac::W as u8;
69    table[Iupac::S as usize] = Iupac::S as u8;
70    table[Iupac::K as usize] = Iupac::M as u8;
71    table[Iupac::M as usize] = Iupac::K as u8;
72    table[Iupac::D as usize] = Iupac::H as u8;
73    table[Iupac::V as usize] = Iupac::B as u8;
74    table[Iupac::H as usize] = Iupac::D as u8;
75    table[Iupac::B as usize] = Iupac::V as u8;
76    table[Iupac::N as usize] = Iupac::N as u8;
77
78    table
79};
80
81impl From<Iupac> for u8 {
82    fn from(b: Iupac) -> u8 {
83        b as u8
84    }
85}
86
87#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Codec)]
88#[bits(4)]
89#[repr(u8)]
90pub enum Iupac {
91    A = 0b1000,
92    C = 0b0100,
93    G = 0b0010,
94    T = 0b0001,
95    R = 0b1010,
96    Y = 0b0101,
97    S = 0b0110,
98    W = 0b1001,
99    K = 0b0011,
100    M = 0b1100,
101    B = 0b0111,
102    D = 0b1011,
103    H = 0b1101,
104    V = 0b1110,
105    N = 0b1111,
106    #[display('-')]
107    X = 0b0000,
108}
109
110impl From<Dna> for Iupac {
111    fn from(dna: Dna) -> Self {
112        match dna {
113            Dna::A => Iupac::A,
114            Dna::C => Iupac::C,
115            Dna::G => Iupac::G,
116            Dna::T => Iupac::T,
117        }
118    }
119}
120
121impl Seq<Iupac> {
122    pub fn contains(&self, rhs: &SeqSlice<Iupac>) -> bool {
123        if rhs.len() != self.len() {
124            return false;
125        }
126
127        self.as_ref() & rhs == rhs
128    }
129}
130
131impl<const N: usize, const W: usize> SeqArray<Iupac, N, W> {
132    pub fn contains(&self, rhs: &SeqSlice<Iupac>) -> bool {
133        if N != rhs.len() {
134            return false;
135        }
136        self.as_ref() & rhs == rhs
137    }
138}
139
140impl SeqSlice<Iupac> {
141    pub fn contains(&self, rhs: &SeqSlice<Iupac>) -> bool {
142        if self.len() != rhs.len() {
143            return false;
144        }
145        self & rhs == rhs
146    }
147}
148
149/// The complement of an IUPAC base is the reverse of the bit-pattern
150impl ComplementMut for Iupac {
151    fn comp(&mut self) {
152        // Below are two methods:
153        // 1. Using a lookup table.
154        // 2. The 7 operation from https://graphics.stanford.edu/~seander/bithacks.html
155        // See: https://stackoverflow.com/questions/3587826/is-there-a-built-in-function-to-reverse-bit-order
156
157        // Use a lookup table
158        *self = Iupac::unsafe_from_bits(IUPAC_COMPLEMENT_TABLE[*self as usize]);
159
160        // Use the The 7 operation from
161        // let b = *self as u32;
162        // *self = Iupac::unsafe_from_bits(
163        //     ((((((b * 0x0802u32) & 0x22110u32) | ((b * 0x8020u32) & 0x88440u32)) * 0x10101u32)
164        //         >> 20) // 16 + 4
165        //         & 0x0f) as u8,
166        // );
167    }
168}
169
170impl Complement for Iupac {}
171
172/*
173impl Complement for Seq<Iupac> {
174    type Output = Self;
175
176    fn comp(&mut self) {
177        todo!()
178    }
179
180    fn to_comp(&self) -> Self::Output {
181        todo!()
182    }
183}
184
185/// Reverse complementing a sequence of IUPAC characters is simply a
186/// matter of reversing the entire bit sequence
187impl ReverseComplement for Seq<Iupac> {
188    type Output = Self;
189
190    fn revcomp(&mut self) {
191        // simply do bit-wise reversal
192        self.bv.reverse();
193    }
194
195    fn to_revcomp(&self) -> Self::Output {
196        todo!()
197    }
198}
199*/
200
201#[cfg(test)]
202mod tests {
203    use crate::prelude::*;
204
205    #[test]
206    fn iupac_ops() {
207        let seq = iupac!("AGCTNNCAGTCGACGTATGTA");
208
209        let pattern = iupac!("AYG");
210
211        let matches: Vec<Seq<Iupac>> = seq
212            .windows(pattern.len())
213            .filter(|w| pattern.contains(w))
214            .collect();
215
216        assert_eq!(matches, vec![iupac!("ACG"), iupac!("ATG")]);
217    }
218
219    #[test]
220    fn iupac_complement() {
221        assert_eq!(
222            iupac!("AGCTYRWSKMDVHBN").to_comp(),
223            iupac!("TCGARYWSMKHBDVN")
224        );
225    }
226}