// bio_seq/codec/dna.rs
//! 2-bit DNA representation: `A: 00, C: 01, G: 10, T: 11`
use crate::codec::{Codec, Complement};
/// A single DNA nucleotide, stored in one byte using a 2-bit discriminant.
///
/// Variants are declared in ascending discriminant order, so the derived
/// ordering is `A < C < G < T`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(u8)]
pub enum Dna {
    /// Adenine, encoded as `0b00`.
    A = 0b00,
    /// Cytosine, encoded as `0b01`.
    C = 0b01,
    /// Guanine, encoded as `0b10`.
    G = 0b10,
    /// Thymine, encoded as `0b11`.
    T = 0b11,
}
impl Codec for Dna {
    /// Number of bits used to encode one nucleotide.
    const BITS: u8 = 2;

    /// Take the two least significant bits of a `u8` and map them to the
    /// corresponding nucleotide. Any higher bits are ignored.
    fn unsafe_from_bits(b: u8) -> Self {
        // After masking only four values remain, one per variant, so a plain
        // match is total and no `transmute` is required.
        match b & 0b11 {
            0b00 => Dna::A,
            0b01 => Dna::C,
            0b10 => Dna::G,
            _ => Dna::T,
        }
    }

    /// We can efficiently verify that a byte is a valid `Dna` value if it's
    /// between 0 and 3. Returns `None` for any other byte.
    fn try_from_bits(b: u8) -> Option<Self> {
        match b {
            0b00 => Some(Dna::A),
            0b01 => Some(Dna::C),
            0b10 => Some(Dna::G),
            0b11 => Some(Dna::T),
            _ => None,
        }
    }

    /// The ASCII values of 'A', 'C', 'G', and 'T' can be translated into
    /// the numbers 0, 1, 2, and 3 using bitwise operations: `((b << 1) + b) >> 3`.
    ///
    /// The input is not validated: bytes other than `b'A'`, `b'C'`, `b'G'`
    /// and `b'T'` are mapped to whichever nucleotide the formula yields.
    /// Use [`Codec::try_from_ascii`] when the input may be invalid.
    fn unsafe_from_ascii(b: u8) -> Self {
        // `(b << 1) + b` exceeds `u8::MAX` for every byte >= 86. A plain `+`
        // would panic in builds with overflow checks and wrap otherwise;
        // `wrapping_add` makes both build profiles agree. The four uppercase
        // nucleotide bytes are all below 86, so their result is unaffected.
        Dna::unsafe_from_bits((b << 1).wrapping_add(b) >> 3)
    }

    /// Convert an uppercase ASCII nucleotide byte into a `Dna` value.
    /// Returns `None` for every byte other than `b'A'`, `b'C'`, `b'G'`, `b'T'`.
    fn try_from_ascii(c: u8) -> Option<Self> {
        match c {
            b'A' => Some(Dna::A),
            b'C' => Some(Dna::C),
            b'G' => Some(Dna::G),
            b'T' => Some(Dna::T),
            _ => None,
        }
    }

    /// Return the uppercase character for this nucleotide.
    fn to_char(self) -> char {
        match self {
            Dna::A => 'A',
            Dna::C => 'C',
            Dna::G => 'G',
            Dna::T => 'T',
        }
    }

    /// Return the 2-bit encoding of this nucleotide in the low bits of a `u8`.
    fn to_bits(self) -> u8 {
        self as u8
    }

    /// Iterate over all four nucleotides in encoding order: `A`, `C`, `G`, `T`.
    fn items() -> impl Iterator<Item = Self> {
        // A fixed-size array iterated by value avoids the heap allocation a
        // `Vec` would need. The fully qualified call yields owned items on
        // every edition.
        IntoIterator::into_iter([Dna::A, Dna::C, Dna::G, Dna::T])
    }
}
impl Complement for Dna {
    /// Return the complementary nucleotide (`A` <-> `T`, `C` <-> `G`).
    ///
    /// In this 2-bit encoding complementary bases have inverted bit patterns,
    /// so a single xor against `0b11` performs the swap.
    fn comp(&self) -> Self {
        // Both encoding bits are flipped by this mask.
        const FLIP_MASK: u8 = 0b11;
        Dna::unsafe_from_bits((*self as u8) ^ FLIP_MASK)
    }
}
#[cfg(test)]
mod tests {
    use crate::prelude::*;

    /// K-mers built from the same sequence compare equal; k-mers whose
    /// sequences differ in one base do not.
    #[test]
    fn dna_kmer_equality() {
        let first = Kmer::<Dna, 8>::try_from(dna!("TGCACATG")).unwrap();
        let second = Kmer::<Dna, 8>::try_from(dna!("TGCACATG")).unwrap();
        assert_eq!(first, second);

        let reference = Kmer::<Dna, 7>::try_from(dna!("GTGACGA")).unwrap();
        let variant = Kmer::<Dna, 7>::try_from(dna!("GTGAAGA")).unwrap();
        assert_ne!(reference, variant);
    }

    /// The `kmer!` macro yields the same value as converting a `dna!`
    /// sequence with `try_from`.
    #[test]
    fn dna_kmer_macro() {
        let from_macro = kmer!("TGCACATG");
        let converted = Kmer::<Dna, 8>::try_from(dna!("TGCACATG")).unwrap();
        assert_eq!(from_macro, converted);

        let macro_reference = kmer!("GTGACGA");
        let converted_variant = Kmer::<Dna, 7>::try_from(dna!("GTGAAGA")).unwrap();
        assert_ne!(macro_reference, converted_variant);
    }

    // NOTE(review): the test below is disabled. It calls `comp()` on `Kmer`
    // values and reads a `bs` field; presumably one of those is not
    // available here. Confirm before re-enabling.
    /*
    #[test]
    fn dna_kmer_complement() {
        assert_eq!(
            format!(
                "{:b}",
                Kmer::<Dna, 8>::try_from(dna!("AAAAAAAA"))
                    .unwrap()
                    .comp()
                    .bs
            ),
            format!(
                "{:b}",
                Kmer::<Dna, 8>::try_from(dna!("TTTTTTTT")).unwrap().bs
            )
        );
        assert_eq!(
            Kmer::<Dna, 1>::try_from(dna!("C")).unwrap().comp(),
            Kmer::<Dna, 1>::try_from(dna!("G")).unwrap()
        );
        assert_eq!(
            Kmer::<Dna, 16>::from(dna!("AAAATGCACATGTTTT")).comp(),
            Kmer::<Dna, 16>::from(dna!("TTTTACGTGTACAAAA"))
        );
    }
    */
}