bio_seq/
codec.rs

1//! Coding/Decoding trait for bit-packable enums representing sets of genomic symbols
2//!
3//! The [dna], [iupac], [text], and [amino] alphabets are built in.
4//!
5//! This trait implements the translation between the UTF-8 representation of an alphabet and its efficient bit-packing.
//! The `BITS` associated constant stores the number of bits used to encode each symbol.
7//! ```
8//! use bio_seq::prelude::{Dna, Codec};
9//! use bio_seq::codec::text;
10//! assert_eq!(Dna::BITS, 2);
11//! assert_eq!(text::Dna::BITS, 8);
12//! ```
13//!
14//! ## Deriving custom Codecs
15//!
16//! Custom encodings can be easily defined on enums using the derivable `Codec` trait.
17//!
18//! ```ignore
19//! use bio_seq::prelude;
20//! use bio_seq::prelude::Codec;
21//!
22//! #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Codec)]
23//! pub enum Dna {
24//!     A = 0b00,
25//!     C = 0b01,
26//!     G = 0b10,
27//!     T = 0b11,
28//! }
29//! ```
30//! ## Implementing custom Codecs
31//!
32//! Custom encodings can be defined on enums by implementing the `Codec` trait.
33//!
34//! ```
35//! use bio_seq::prelude;
36//! use bio_seq::prelude::Codec;
37//!
38//! #[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
39//! pub enum Dna {
40//!     A = 0b00,
41//!     C = 0b01,
42//!     G = 0b10,
43//!     T = 0b11,
44//! }
45//!
46//! impl From<Dna> for u8 {
47//!    fn from(base: Dna) -> u8 {
48//!         match base {
49//!             Dna::A => 0b00,
50//!             Dna::C => 0b01,
51//!             Dna::G => 0b10,
52//!             Dna::T => 0b11,
53//!         }
54//!    }
55//! }
56//!
57//! impl Codec for Dna {
58//!     const BITS: u8 = 2;
59//!
60//!     fn unsafe_from_bits(bits: u8) -> Self {
61//!         if let Some(base) = Self::try_from_bits(bits) {
62//!             base
63//!         } else {
64//!             panic!("Unrecognised bit pattern!")
65//!         }
66//!     }
67//!
68//!     fn try_from_bits(bits: u8) -> Option<Self> {
69//!         match bits {
70//!             0b00 => Some(Dna::A),
71//!             0b01 => Some(Dna::C),
72//!             0b10 => Some(Dna::G),
73//!             0b11 => Some(Dna::T),
74//!             _ => None,
75//!         }
76//!     }
77//!
78//!     fn unsafe_from_ascii(chr: u8) -> Self {
79//!         if let Some(base) = Self::try_from_ascii(chr) {
80//!             base
81//!         } else {
//!             panic!("Unrecognised ASCII character!")
83//!         }
84//!     }
85//!
86//!     fn try_from_ascii(chr: u8) -> Option<Self> {
87//!         match chr {
88//!             b'A' => Some(Dna::A),
89//!             b'C' => Some(Dna::C),
90//!             b'G' => Some(Dna::G),
91//!             b'T' => Some(Dna::T),
92//!             _ => None,
93//!         }
94//!     }
95//!
96//!     fn to_char(self) -> char {
97//!         match self {
98//!             Dna::A => 'A',
99//!             Dna::C => 'C',
100//!             Dna::G => 'G',
101//!             Dna::T => 'T',
102//!         }
103//!     }
104//!
105//!     fn to_bits(self) -> u8 {
106//!         self as u8
107//!     }
108//!
109//!     fn items() -> impl Iterator<Item = Self> {
110//!         vec![Dna::A, Dna::C, Dna::G, Dna::T].into_iter()
111//!     }
112//! }
113//!
114//! ```
115
116use core::fmt;
117use core::hash::Hash;
118
119pub mod amino;
120pub mod dna;
121pub mod iupac;
122
123#[cfg(feature = "extra_codecs")]
124pub mod masked;
125
126#[cfg(feature = "extra_codecs")]
127pub mod degenerate;
128
129pub mod text;
130
131pub use bio_seq_derive::Codec;
132
/// An alphabet whose symbols can be packed into a fixed number of bits.
///
/// `Codec` describes both directions of the translation: encoding ASCII bytes
/// as symbols, and decoding symbols back into characters or raw bits. Any type
/// can carry the binary encoding, but the intended implementor is an `enum`
/// that is transparently represented as a `u8`.
pub trait Codec: Copy + Clone + Eq + PartialEq + Hash + fmt::Debug {
    /// Width of a single encoded symbol in bits, e.g. `Dna::BITS` is 2 and
    /// `Iupac::BITS` is 4.
    const BITS: u8;

    /// Fallibly decodes raw bits into a symbol.
    ///
    /// Returns `None` when the binary value does not match the discriminant
    /// of any enum member.
    fn try_from_bits(b: u8) -> Option<Self>;

    /// Decodes raw bits into a symbol without reporting failure.
    ///
    /// A binary value that does not match the discriminant of any enum member
    /// results in either a panic or an arbitrary enum item. Prefer
    /// [`Self::try_from_bits`] when the value may be invalid.
    fn unsafe_from_bits(b: u8) -> Self;

    /// Fallibly encodes an ASCII byte as a symbol of this codec.
    fn try_from_ascii(c: u8) -> Option<Self>;

    /// Encodes an ASCII byte as a symbol of this codec.
    ///
    /// Prefer [`Self::try_from_ascii`] when the byte may not belong to the
    /// alphabet.
    fn unsafe_from_ascii(c: u8) -> Self;

    /// Returns the raw bits that encode this symbol.
    fn to_bits(self) -> u8;

    /// Decodes this symbol into its UTF-8 character.
    fn to_char(self) -> char;

    /// Returns an iterator over the symbols of this codec.
    fn items() -> impl Iterator<Item = Self>;
}
166
#[cfg(test)]
mod tests {
    use super::{dna::Dna, iupac::Iupac};

    /// Checks `Iupac::from` for each of the four `Dna` bases.
    #[test]
    fn dna_to_iupac() {
        // Each base must convert to the IUPAC symbol of the same name.
        let matching = [
            (Dna::A, Iupac::A),
            (Dna::C, Iupac::C),
            (Dna::G, Iupac::G),
            (Dna::T, Iupac::T),
        ];
        for (base, expected) in matching {
            assert_eq!(Iupac::from(base), expected);
        }

        // A base must not convert to the symbol of a different base.
        let mismatched = [
            (Dna::A, Iupac::T),
            (Dna::T, Iupac::A),
            (Dna::C, Iupac::T),
            (Dna::G, Iupac::T),
        ];
        for (base, unexpected) in mismatched {
            assert_ne!(Iupac::from(base), unexpected);
        }
    }
}