bio_seq/codec.rs
1//! Coding/Decoding trait for bit-packable enums representing sets of genomic symbols
2//!
3//! The [dna], [iupac], [text], and [amino] alphabets are built in.
4//!
5//! This trait implements the translation between the UTF-8 representation of an alphabet and its efficient bit-packing.
//! The `BITS` associated constant stores the number of bits used by the representation.
7//! ```
8//! use bio_seq::prelude::{Dna, Codec};
9//! use bio_seq::codec::text;
10//! assert_eq!(Dna::BITS, 2);
11//! assert_eq!(text::Dna::BITS, 8);
12//! ```
13//!
14//! ## Deriving custom Codecs
15//!
16//! Custom encodings can be easily defined on enums using the derivable `Codec` trait.
17//!
18//! ```ignore
19//! use bio_seq::prelude;
20//! use bio_seq::prelude::Codec;
21//!
22//! #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Codec)]
23//! pub enum Dna {
24//! A = 0b00,
25//! C = 0b01,
26//! G = 0b10,
27//! T = 0b11,
28//! }
29//! ```
30//! ## Implementing custom Codecs
31//!
32//! Custom encodings can be defined on enums by implementing the `Codec` trait.
33//!
34//! ```
35//! use bio_seq::prelude;
36//! use bio_seq::prelude::Codec;
37//!
38//! #[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
39//! pub enum Dna {
40//! A = 0b00,
41//! C = 0b01,
42//! G = 0b10,
43//! T = 0b11,
44//! }
45//!
46//! impl From<Dna> for u8 {
47//! fn from(base: Dna) -> u8 {
48//! match base {
49//! Dna::A => 0b00,
50//! Dna::C => 0b01,
51//! Dna::G => 0b10,
52//! Dna::T => 0b11,
53//! }
54//! }
55//! }
56//!
57//! impl Codec for Dna {
58//! const BITS: u8 = 2;
59//!
60//! fn unsafe_from_bits(bits: u8) -> Self {
61//! if let Some(base) = Self::try_from_bits(bits) {
62//! base
63//! } else {
64//! panic!("Unrecognised bit pattern!")
65//! }
66//! }
67//!
68//! fn try_from_bits(bits: u8) -> Option<Self> {
69//! match bits {
70//! 0b00 => Some(Dna::A),
71//! 0b01 => Some(Dna::C),
72//! 0b10 => Some(Dna::G),
73//! 0b11 => Some(Dna::T),
74//! _ => None,
75//! }
76//! }
77//!
78//! fn unsafe_from_ascii(chr: u8) -> Self {
79//! if let Some(base) = Self::try_from_ascii(chr) {
80//! base
81//! } else {
//! panic!("Unrecognised ASCII byte!")
83//! }
84//! }
85//!
86//! fn try_from_ascii(chr: u8) -> Option<Self> {
87//! match chr {
88//! b'A' => Some(Dna::A),
89//! b'C' => Some(Dna::C),
90//! b'G' => Some(Dna::G),
91//! b'T' => Some(Dna::T),
92//! _ => None,
93//! }
94//! }
95//!
96//! fn to_char(self) -> char {
97//! match self {
98//! Dna::A => 'A',
99//! Dna::C => 'C',
100//! Dna::G => 'G',
101//! Dna::T => 'T',
102//! }
103//! }
104//!
105//! fn to_bits(self) -> u8 {
106//! self as u8
107//! }
108//!
109//! fn items() -> impl Iterator<Item = Self> {
110//! vec![Dna::A, Dna::C, Dna::G, Dna::T].into_iter()
111//! }
112//! }
113//!
114//! ```
115
116use core::fmt;
117use core::hash::Hash;
118
119pub mod amino;
120pub mod dna;
121pub mod iupac;
122
123#[cfg(feature = "extra_codecs")]
124pub mod masked;
125
126#[cfg(feature = "extra_codecs")]
127pub mod degenerate;
128
129pub mod text;
130
131pub use bio_seq_derive::Codec;
132
/// Translation between the ASCII bytes of an alphabet's symbols and their
/// binary encoding.
///
/// Any type may be used to carry the binary encoding of the symbols; the
/// representation this trait is intended for is an `Enum` that is
/// transparently represented as a `u8`.
pub trait Codec: Copy + Clone + Eq + PartialEq + Hash + fmt::Debug {
    /// Width, in bits, of one encoded symbol, e.g. `Dna::BITS` = 2 and
    /// `Iupac::BITS` = 4.
    const BITS: u8;

    /// Builds an enum item from the raw bits of its binary encoding.
    ///
    /// A binary value that matches no enum member's discriminant results in
    /// a panic or a random enum item.
    fn unsafe_from_bits(b: u8) -> Self;

    /// Fallible conversion from raw bits: yields `None` when the binary
    /// value does not match a discriminant.
    fn try_from_bits(b: u8) -> Option<Self>;

    /// Encodes an ASCII byte as an item of this codec.
    fn unsafe_from_ascii(c: u8) -> Self;

    /// Fallibly encodes an ASCII byte as an item of this codec.
    fn try_from_ascii(c: u8) -> Option<Self>;

    /// Decodes this item as a UTF-8 character.
    fn to_char(self) -> char;

    /// Encodes this item as raw bits.
    fn to_bits(self) -> u8;

    /// Returns an iterator over the symbols of the codec.
    fn items() -> impl Iterator<Item = Self>;
}
166
#[cfg(test)]
mod tests {
    use super::dna::Dna;
    use super::iupac::Iupac;

    /// Checks the `Dna` -> `Iupac` conversion: each base must convert to the
    /// `Iupac` item of the same name, and must not compare equal to a
    /// differently named one.
    #[test]
    fn dna_to_iupac() {
        // Each base paired with the IUPAC item it is expected to convert to.
        let same_symbol = [
            (Dna::A, Iupac::A),
            (Dna::C, Iupac::C),
            (Dna::G, Iupac::G),
            (Dna::T, Iupac::T),
        ];
        for (base, expected) in same_symbol {
            assert_eq!(Iupac::from(base), expected);
        }

        // Each base paired with an IUPAC item it must NOT convert to.
        let different_symbol = [
            (Dna::A, Iupac::T),
            (Dna::T, Iupac::A),
            (Dna::C, Iupac::T),
            (Dna::G, Iupac::T),
        ];
        for (base, other) in different_symbol {
            assert_ne!(Iupac::from(base), other);
        }
    }
}