bio_seq/codec/dna.rs
1//! 2-bit DNA representation: `A: 00, C: 01, G: 10, T: 11`
2
3use crate::codec::Codec;
4//use crate::kmer::Kmer;
5//use crate::seq::{Seq, SeqArray, SeqSlice};
6use crate::{Complement, ComplementMut};
7
/// A single DNA nucleotide, stored as a 2-bit code inside a `u8`.
///
/// The discriminant of each variant is its encoded value, so `as u8`
/// yields the packed representation directly. The derived ordering follows
/// the declaration order: `A < C < G < T`.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Dna {
    /// Adenine, encoded as `0b00`.
    A = 0,
    /// Cytosine, encoded as `0b01`.
    C = 1,
    /// Guanine, encoded as `0b10`.
    G = 2,
    /// Thymine, encoded as `0b11`.
    T = 3,
}
16
17impl Codec for Dna {
18 const BITS: u8 = 2;
19
20 /// Transmute a `u8` into a nucleotide
21 ///
22 /// SAFETY: This only looks at the lower 2 bits of the `u8`
23 fn unsafe_from_bits(b: u8) -> Self {
24 debug_assert!(b < 4);
25 unsafe { std::mem::transmute(b & 0b11) }
26 }
27
28 /// We can verify that a byte is a valid `Dna` value if it's
29 /// between 0 and 3.
30 fn try_from_bits(b: u8) -> Option<Self> {
31 if b < 4 {
32 Some(unsafe { std::mem::transmute::<u8, Dna>(b) })
33 } else {
34 None
35 }
36 }
37
38 /// The ASCII values of 'A', 'C', 'G', and 'T' can be translated into
39 /// the numbers 0, 1, 2, and 3 using bitwise operations: `((b << 1) + b) >> 3`.
40 /// In other words, multiply the ASCII value by 3 and shift right.
41 fn unsafe_from_ascii(b: u8) -> Self {
42 // TODO: benchmark against b * 3
43 Dna::unsafe_from_bits(((b << 1) + b) >> 3)
44 }
45
46 fn try_from_ascii(c: u8) -> Option<Self> {
47 match c {
48 b'A' => Some(Dna::A),
49 b'C' => Some(Dna::C),
50 b'G' => Some(Dna::G),
51 b'T' => Some(Dna::T),
52 _ => None,
53 }
54 }
55
56 fn to_char(self) -> char {
57 match self {
58 Dna::A => 'A',
59 Dna::C => 'C',
60 Dna::G => 'G',
61 Dna::T => 'T',
62 }
63 }
64
65 fn to_bits(self) -> u8 {
66 self as u8
67 }
68
69 fn items() -> impl Iterator<Item = Self> {
70 vec![Dna::A, Dna::C, Dna::G, Dna::T].into_iter()
71 }
72}
73
74/// This 2-bit representation of nucleotides lends itself to a very fast
75/// complement implementation with bitwise xor
76impl ComplementMut for Dna {
77 fn comp(&mut self) {
78 *self = Dna::unsafe_from_bits(*self as u8 ^ 0b11);
79 }
80}
81
82impl Complement for Dna {}
83
84/*
85impl ComplementMut for Seq<Dna> {
86 fn comp(&mut self) {
87 for word in self.bv.as_raw_mut_slice() {
88 *word ^= usize::MAX;
89 }
90 }
91}
92*/
93
94/*
95impl ReverseMut for Seq<Dna> {
96 fn rev(&mut self) {
97 self.bv.reverse();
98 for word in self.bv.as_raw_mut_slice().iter_mut() {
99 let c: usize = *word;
100 let odds = (c & 0x5555_5555_5555_5555usize) >> 1;
101 *word = (c & 0xAAAA_AAAA_AAAA_AAAAusize) << 1 | odds;
102 }
103 }
104}
105*/
106
#[cfg(test)]
mod tests {
    use crate::prelude::*;

    /// K-mers built from identical sequences compare equal; k-mers that differ
    /// in a single base do not.
    #[test]
    fn dna_kmer_equality() {
        let first = Kmer::<Dna, 8>::try_from(dna!("TGCACATG")).unwrap();
        let second = Kmer::<Dna, 8>::try_from(dna!("TGCACATG")).unwrap();
        assert_eq!(first, second);

        // Same length, one substitution (C -> A at the fifth base).
        let reference = Kmer::<Dna, 7>::try_from(dna!("GTGACGA")).unwrap();
        let variant = Kmer::<Dna, 7>::try_from(dna!("GTGAAGA")).unwrap();
        assert_ne!(reference, variant);
    }

    /// The `kmer!` macro agrees with converting a `dna!` sequence explicitly.
    #[test]
    fn dna_kmer_macro() {
        let from_macro = kmer!("TGCACATG");
        let from_seq = Kmer::<Dna, 8>::try_from(dna!("TGCACATG")).unwrap();
        assert_eq!(from_macro, from_seq);

        let other_macro = kmer!("GTGACGA");
        let other_seq = Kmer::<Dna, 7>::try_from(dna!("GTGAAGA")).unwrap();
        assert_ne!(other_macro, other_seq);
    }

    // NOTE(review): the test below is disabled. It calls `comp()` on `Kmer`
    // and reads a `.bs` field; confirm both still exist before re-enabling.
    /*
    #[test]
    fn dna_kmer_complement() {
        assert_eq!(
            format!(
                "{:b}",
                Kmer::<Dna, 8>::try_from(dna!("AAAAAAAA"))
                    .unwrap()
                    .comp()
                    .bs
            ),
            format!(
                "{:b}",
                Kmer::<Dna, 8>::try_from(dna!("TTTTTTTT")).unwrap().bs
            )
        );

        assert_eq!(
            Kmer::<Dna, 1>::try_from(dna!("C")).unwrap().comp(),
            Kmer::<Dna, 1>::try_from(dna!("G")).unwrap()
        );

        assert_eq!(
            Kmer::<Dna, 16>::from(dna!("AAAATGCACATGTTTT")).comp(),
            Kmer::<Dna, 16>::from(dna!("TTTTACGTGTACAAAA"))
        );
    }
    */
}
163}