// bio_seq/codec/mod.rs
//! Coding/Decoding trait for bit-packable enums representing sets of genomic symbols
//!
//! The [dna], [iupac], [text], and [amino] alphabets are built in.
//!
//! This trait implements the translation between the UTF-8 representation of an alphabet and its efficient bit-packing.
//! The `BITS` associated constant stores the number of bits used by the representation.
//! ```
//! use bio_seq::prelude::{Dna, Codec};
//! use bio_seq::codec::text;
//! assert_eq!(Dna::BITS, 2);
//! assert_eq!(text::Dna::BITS, 8);
//! ```
//!
//! ## Deriving custom Codecs
//!
//! Custom encodings can be easily defined on enums using the derivable `Codec` trait.
//!
//! ```ignore
//! use bio_seq::prelude;
//! use bio_seq::prelude::Codec;
//!
//! #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Codec)]
//! pub enum Dna {
//! A = 0b00,
//! C = 0b01,
//! G = 0b10,
//! T = 0b11,
//! }
//! ```
//! ## Implementing custom Codecs
//!
//! Custom encodings can be defined on enums by implementing the `Codec` trait.
//!
//! ```
//! use bio_seq::prelude;
//! use bio_seq::prelude::Codec;
//!
//! #[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
//! pub enum Dna {
//! A = 0b00,
//! C = 0b01,
//! G = 0b10,
//! T = 0b11,
//! }
//!
//! impl From<Dna> for u8 {
//! fn from(base: Dna) -> u8 {
//! match base {
//! Dna::A => 0b00,
//! Dna::C => 0b01,
//! Dna::G => 0b10,
//! Dna::T => 0b11,
//! }
//! }
//! }
//!
//! impl Codec for Dna {
//! const BITS: u8 = 2;
//!
//! fn unsafe_from_bits(bits: u8) -> Self {
//! if let Some(base) = Self::try_from_bits(bits) {
//! base
//! } else {
//! panic!("Unrecognised bit pattern!")
//! }
//! }
//!
//! fn try_from_bits(bits: u8) -> Option<Self> {
//! match bits {
//! 0b00 => Some(Dna::A),
//! 0b01 => Some(Dna::C),
//! 0b10 => Some(Dna::G),
//! 0b11 => Some(Dna::T),
//! _ => None,
//! }
//! }
//!
//! fn unsafe_from_ascii(chr: u8) -> Self {
//! if let Some(base) = Self::try_from_ascii(chr) {
//! base
//! } else {
//! panic!("Unrecognised ASCII character!")
//! }
//! }
//!
//! fn try_from_ascii(chr: u8) -> Option<Self> {
//! match chr {
//! b'A' => Some(Dna::A),
//! b'C' => Some(Dna::C),
//! b'G' => Some(Dna::G),
//! b'T' => Some(Dna::T),
//! _ => None,
//! }
//! }
//!
//! fn to_char(self) -> char {
//! match self {
//! Dna::A => 'A',
//! Dna::C => 'C',
//! Dna::G => 'G',
//! Dna::T => 'T',
//! }
//! }
//!
//! fn to_bits(self) -> u8 {
//! self as u8
//! }
//!
//! fn items() -> impl Iterator<Item = Self> {
//! vec![Dna::A, Dna::C, Dna::G, Dna::T].into_iter()
//! }
//! }
//!
//! ```
use core::fmt;
use core::hash::Hash;
pub mod amino;
pub mod dna;
pub mod iupac;
pub mod masked;
pub mod text;
pub use bio_seq_derive::Codec;
/// A fixed-width binary encoding for the symbols of an alphabet.
///
/// The bit encodings of an alphabet's symbols can be represented with any type.
/// Encoding from ASCII bytes and decoding the representation are both implemented
/// through the `Codec` trait.
///
/// The intended representation is an `enum`, transparently represented as a `u8`.
///
/// Implementors must also be `Debug`, `Copy`, `Clone`, `PartialEq`, `Eq` and `Hash`,
/// as required by the supertrait bounds below.
pub trait Codec: fmt::Debug + Copy + Clone + PartialEq + Hash + Eq {
/// The number of bits used to encode each symbol, e.g. `Dna::BITS` is 2 and
/// `Iupac::BITS` is 4.
const BITS: u8;
/// Convert the raw bits of a binary encoding into an enum item, without a way to
/// report failure.
///
/// Binary values that don't match an enum member's discriminant will result in
/// a panic or an arbitrary enum item, depending on the implementation. Note that,
/// despite its name, this method is not declared as an `unsafe fn`.
///
/// Use [`Codec::try_from_bits`] when the input is not known to be valid.
fn unsafe_from_bits(b: u8) -> Self;
/// Fallibly convert raw bits into an enum item.
///
/// Returns `None` if the binary value does not match a discriminant.
fn try_from_bits(b: u8) -> Option<Self>;
/// Encode an ASCII byte as a codec enum item, without a way to report failure.
///
/// What happens for a byte that is not a symbol of the alphabet is left to the
/// implementation; the hand-written example in this module's documentation panics.
///
/// Use [`Codec::try_from_ascii`] when the input is not known to be valid.
fn unsafe_from_ascii(c: u8) -> Self;
/// Fallibly encode an ASCII byte as a codec enum item.
///
/// Returns `Some(item)` on success and `None` otherwise.
fn try_from_ascii(c: u8) -> Option<Self>;
/// Decode this enum item as a UTF-8 character.
fn to_char(self) -> char;
/// Encode this enum item as its raw bits.
fn to_bits(self) -> u8;
/// Return an iterator over the symbols of the codec.
fn items() -> impl Iterator<Item = Self>;
}
/// Values that have a complement, such as nucleotides and nucleotide sequences.
pub trait Complement {
    /// Return the complement of this value, leaving the original unchanged.
    ///
    /// ```
    /// use bio_seq::prelude::{Dna, Complement};
    /// assert_eq!(Dna::A.comp(), Dna::T);
    /// ```
    fn comp(&self) -> Self;

    /// Replace this value with its complement, in place.
    ///
    /// The default implementation overwrites `self` with the result of
    /// [`Complement::comp`].
    fn comp_assign(&mut self)
    where
        Self: Sized,
    {
        let complemented = self.comp();
        *self = complemented;
    }
}
#[cfg(test)]
mod tests {
    use super::dna::Dna;
    use super::iupac::Iupac;

    /// Converting a `Dna` base into `Iupac` yields the symbol of the same name,
    /// and not a different one.
    #[test]
    fn dna_to_iupac() {
        // Each base must convert to the identically named IUPAC symbol.
        let matching = [
            (Dna::A, Iupac::A),
            (Dna::C, Iupac::C),
            (Dna::G, Iupac::G),
            (Dna::T, Iupac::T),
        ];
        for (base, expected) in matching {
            assert_eq!(Iupac::from(base), expected);
        }

        // Spot checks that a conversion does not land on some other symbol.
        let mismatched = [
            (Dna::A, Iupac::T),
            (Dna::T, Iupac::A),
            (Dna::C, Iupac::T),
            (Dna::G, Iupac::T),
        ];
        for (base, other) in mismatched {
            assert_ne!(Iupac::from(base), other);
        }
    }
}