bio_seq/codec/iupac.rs
1//! 4-bit IUPAC nucleotide ambiguity codes
2//!
3//! IUPAC nucleotide ambiguity codes are represented with 4 bits
4//!
5//! | | A | C | G | T |
6//! | - | - | - | - | - |
7//! | A | 1 | 0 | 0 | 0 |
8//! | C | 0 | 1 | 0 | 0 |
9//! | G | 0 | 0 | 1 | 0 |
10//! | T | 0 | 0 | 0 | 1 |
11//! | Y | 0 | 1 | 0 | 1 |
12//! | R | 1 | 0 | 1 | 0 |
13//! | W | 1 | 0 | 0 | 1 |
14//! | S | 0 | 1 | 1 | 0 |
15//! | K | 0 | 0 | 1 | 1 |
16//! | M | 1 | 1 | 0 | 0 |
17//! | D | 1 | 0 | 1 | 1 |
18//! | V | 1 | 1 | 1 | 0 |
19//! | H | 1 | 1 | 0 | 1 |
20//! | B | 0 | 1 | 1 | 1 |
21//! | N | 1 | 1 | 1 | 1 |
22//! | X/- | 0 | 0 | 0 | 0 |
23//!
24//! This means that we can treat each symbol as a set and we get meaningful bitwise operations:
25//!
26//! ```rust
27//! use bio_seq::prelude::*;
28//!
29//! // Set union:
30//! let union = iupac!("AS-GYTNAN") | iupac!("ANTGCAT-N");
31//! assert_eq!(union, iupac!("ANTGYWNAN"));
32//!
33//! // Set intersection:
34//! let intersection = iupac!("ACGTSWKMN") & iupac!("WKMSTNNAN");
35//! assert_eq!(intersection, iupac!("A----WKAN"));
36//! ```
37//!
38//! Which can be used to implement pattern matching:
39//!
40//! ```rust
41//! use bio_seq::prelude::*;
42//!
43//! let seq = iupac!("AGCTNNCAGTCGACGTATGTA");
44//! let pattern = iupac!("AYG");
45//!
46//! for slice in seq.windows(pattern.len()) {
47//! if pattern.contains(slice) {
48//! println!("{slice} matches pattern");
49//! }
50//! }
51//!
52//! // ACG matches pattern
53//! // ATG matches pattern
54//! ```
55use crate::codec::{Codec, dna::Dna};
56use crate::seq::{Seq, SeqArray, SeqSlice};
57use crate::{Complement, ComplementMut};
58
59const IUPAC_COMPLEMENT_TABLE: [u8; 16] = {
60 let mut table = [0; 16];
61
62 table[Iupac::A as usize] = Iupac::T as u8;
63 table[Iupac::C as usize] = Iupac::G as u8;
64 table[Iupac::G as usize] = Iupac::C as u8;
65 table[Iupac::T as usize] = Iupac::A as u8;
66 table[Iupac::Y as usize] = Iupac::R as u8;
67 table[Iupac::R as usize] = Iupac::Y as u8;
68 table[Iupac::W as usize] = Iupac::W as u8;
69 table[Iupac::S as usize] = Iupac::S as u8;
70 table[Iupac::K as usize] = Iupac::M as u8;
71 table[Iupac::M as usize] = Iupac::K as u8;
72 table[Iupac::D as usize] = Iupac::H as u8;
73 table[Iupac::V as usize] = Iupac::B as u8;
74 table[Iupac::H as usize] = Iupac::D as u8;
75 table[Iupac::B as usize] = Iupac::V as u8;
76 table[Iupac::N as usize] = Iupac::N as u8;
77
78 table
79};
80
81impl From<Iupac> for u8 {
82 fn from(b: Iupac) -> u8 {
83 b as u8
84 }
85}
86
87#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Codec)]
88#[bits(4)]
89#[repr(u8)]
90pub enum Iupac {
91 A = 0b1000,
92 C = 0b0100,
93 G = 0b0010,
94 T = 0b0001,
95 R = 0b1010,
96 Y = 0b0101,
97 S = 0b0110,
98 W = 0b1001,
99 K = 0b0011,
100 M = 0b1100,
101 B = 0b0111,
102 D = 0b1011,
103 H = 0b1101,
104 V = 0b1110,
105 N = 0b1111,
106 #[display('-')]
107 X = 0b0000,
108}
109
110impl From<Dna> for Iupac {
111 fn from(dna: Dna) -> Self {
112 match dna {
113 Dna::A => Iupac::A,
114 Dna::C => Iupac::C,
115 Dna::G => Iupac::G,
116 Dna::T => Iupac::T,
117 }
118 }
119}
120
121impl Seq<Iupac> {
122 pub fn contains(&self, rhs: &SeqSlice<Iupac>) -> bool {
123 if rhs.len() != self.len() {
124 return false;
125 }
126
127 self.as_ref() & rhs == rhs
128 }
129}
130
131impl<const N: usize, const W: usize> SeqArray<Iupac, N, W> {
132 pub fn contains(&self, rhs: &SeqSlice<Iupac>) -> bool {
133 if N != rhs.len() {
134 return false;
135 }
136 self.as_ref() & rhs == rhs
137 }
138}
139
140impl SeqSlice<Iupac> {
141 pub fn contains(&self, rhs: &SeqSlice<Iupac>) -> bool {
142 if self.len() != rhs.len() {
143 return false;
144 }
145 self & rhs == rhs
146 }
147}
148
149/// The complement of an IUPAC base is the reverse of the bit-pattern
150impl ComplementMut for Iupac {
151 fn comp(&mut self) {
152 // Below are two methods:
153 // 1. Using a lookup table.
154 // 2. The 7 operation from https://graphics.stanford.edu/~seander/bithacks.html
155 // See: https://stackoverflow.com/questions/3587826/is-there-a-built-in-function-to-reverse-bit-order
156
157 // Use a lookup table
158 *self = Iupac::unsafe_from_bits(IUPAC_COMPLEMENT_TABLE[*self as usize]);
159
160 // Use the The 7 operation from
161 // let b = *self as u32;
162 // *self = Iupac::unsafe_from_bits(
163 // ((((((b * 0x0802u32) & 0x22110u32) | ((b * 0x8020u32) & 0x88440u32)) * 0x10101u32)
164 // >> 20) // 16 + 4
165 // & 0x0f) as u8,
166 // );
167 }
168}
169
170impl Complement for Iupac {}
171
172/*
173impl Complement for Seq<Iupac> {
174 type Output = Self;
175
176 fn comp(&mut self) {
177 todo!()
178 }
179
180 fn to_comp(&self) -> Self::Output {
181 todo!()
182 }
183}
184
185/// Reverse complementing a sequence of IUPAC characters is simply a
186/// matter of reversing the entire bit sequence
187impl ReverseComplement for Seq<Iupac> {
188 type Output = Self;
189
190 fn revcomp(&mut self) {
191 // simply do bit-wise reversal
192 self.bv.reverse();
193 }
194
195 fn to_revcomp(&self) -> Self::Output {
196 todo!()
197 }
198}
199*/
200
201#[cfg(test)]
202mod tests {
203 use crate::prelude::*;
204
205 #[test]
206 fn iupac_ops() {
207 let seq = iupac!("AGCTNNCAGTCGACGTATGTA");
208
209 let pattern = iupac!("AYG");
210
211 let matches: Vec<Seq<Iupac>> = seq
212 .windows(pattern.len())
213 .filter(|w| pattern.contains(w))
214 .collect();
215
216 assert_eq!(matches, vec![iupac!("ACG"), iupac!("ATG")]);
217 }
218
219 #[test]
220 fn iupac_complement() {
221 assert_eq!(
222 iupac!("AGCTYRWSKMDVHBN").to_comp(),
223 iupac!("TCGARYWSMKHBDVN")
224 );
225 }
226}