Skip to main content

use_alphabet/
lib.rs

1#![forbid(unsafe_code)]
2#![doc = include_str!("../README.md")]
3
4use core::{fmt, str::FromStr};
5use std::{collections::BTreeSet, error::Error};
6
/// Failure reported by the fallible alphabet constructors.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AlphabetError {
    /// No symbols were supplied, so no symbol set could be built.
    EmptySymbolSet,
}

impl fmt::Display for AlphabetError {
    /// Writes the human-readable message associated with the error variant.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Resolve the message first so the single write call stays at the end.
        let message = match *self {
            Self::EmptySymbolSet => "alphabet symbol set cannot be empty",
        };

        formatter.write_str(message)
    }
}

// No underlying cause is carried, so the default `Error` methods are sufficient.
impl Error for AlphabetError {}
23
/// A descriptive kind for a biological alphabet.
///
/// Built-in kinds are displayed as lowercase, hyphenated names (for example
/// `dna` or `rna-with-ambiguity`); [`AlphabetKind::Custom`] displays its label
/// verbatim.
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum AlphabetKind {
    /// DNA alphabet.
    Dna,
    /// RNA alphabet.
    Rna,
    /// Protein alphabet.
    Protein,
    /// DNA alphabet including ambiguity symbols.
    DnaWithAmbiguity,
    /// RNA alphabet including ambiguity symbols.
    RnaWithAmbiguity,
    /// Protein alphabet including ambiguity symbols.
    ProteinWithAmbiguity,
    /// Domain-specific alphabet identified by a free-form label.
    Custom(String),
}

impl fmt::Display for AlphabetKind {
    /// Writes the kind's textual name; custom kinds write their stored label.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            Self::Dna => "dna",
            Self::Rna => "rna",
            Self::Protein => "protein",
            Self::DnaWithAmbiguity => "dna-with-ambiguity",
            Self::RnaWithAmbiguity => "rna-with-ambiguity",
            Self::ProteinWithAmbiguity => "protein-with-ambiguity",
            Self::Custom(kind) => kind.as_str(),
        };

        formatter.write_str(name)
    }
}

impl FromStr for AlphabetKind {
    type Err = core::convert::Infallible;

    /// Parses a kind name.
    ///
    /// Surrounding whitespace is ignored and built-in names are matched
    /// ASCII-case-insensitively, accepting both `-` and `_` separators in the
    /// ambiguity variants. Any other input becomes [`AlphabetKind::Custom`]
    /// holding the trimmed text with its original casing.
    ///
    /// # Errors
    ///
    /// Never fails; unrecognised input falls back to [`AlphabetKind::Custom`].
    fn from_str(value: &str) -> Result<Self, Self::Err> {
        let trimmed = value.trim();

        let kind = match trimmed.to_ascii_lowercase().as_str() {
            "dna" => Self::Dna,
            "rna" => Self::Rna,
            "protein" => Self::Protein,
            "dna-with-ambiguity" | "dna_with_ambiguity" => Self::DnaWithAmbiguity,
            "rna-with-ambiguity" | "rna_with_ambiguity" => Self::RnaWithAmbiguity,
            "protein-with-ambiguity" | "protein_with_ambiguity" => Self::ProteinWithAmbiguity,
            // Store the trimmed label so custom kinds get the same whitespace
            // handling as built-in ones; casing is kept because custom labels
            // are free-form.
            _ => Self::Custom(trimmed.to_string()),
        };

        Ok(kind)
    }
}
74
75/// Deterministic set of biological alphabet symbols.
76#[derive(Clone, Debug, Eq, PartialEq)]
77pub struct AlphabetSymbolSet {
78    symbols: BTreeSet<char>,
79}
80
81impl AlphabetSymbolSet {
82    /// Creates a symbol set from characters.
83    ///
84    /// # Errors
85    ///
86    /// Returns [`AlphabetError::EmptySymbolSet`] when no symbols are supplied.
87    pub fn new(symbols: impl IntoIterator<Item = char>) -> Result<Self, AlphabetError> {
88        let symbols = symbols.into_iter().collect::<BTreeSet<_>>();
89
90        if symbols.is_empty() {
91            Err(AlphabetError::EmptySymbolSet)
92        } else {
93            Ok(Self { symbols })
94        }
95    }
96
97    /// Creates a symbol set from string characters.
98    ///
99    /// # Errors
100    ///
101    /// Returns [`AlphabetError::EmptySymbolSet`] when the string is empty.
102    pub fn from_symbols(symbols: impl AsRef<str>) -> Result<Self, AlphabetError> {
103        Self::new(symbols.as_ref().chars())
104    }
105
106    /// Returns true when the symbol is present.
107    #[must_use]
108    pub fn contains(&self, symbol: char) -> bool {
109        self.symbols.contains(&symbol)
110    }
111
112    /// Returns the number of distinct symbols.
113    #[must_use]
114    pub fn len(&self) -> usize {
115        self.symbols.len()
116    }
117
118    /// Returns true when the set contains no symbols.
119    #[must_use]
120    pub fn is_empty(&self) -> bool {
121        self.symbols.is_empty()
122    }
123
124    /// Iterates symbols in deterministic order.
125    pub fn iter(&self) -> impl Iterator<Item = &char> {
126        self.symbols.iter()
127    }
128}
129
130/// A biological alphabet with a descriptive kind and symbol set.
131#[derive(Clone, Debug, Eq, PartialEq)]
132pub struct BioAlphabet {
133    kind: AlphabetKind,
134    symbols: AlphabetSymbolSet,
135}
136
137impl BioAlphabet {
138    /// Creates an alphabet from a kind and symbol set.
139    #[must_use]
140    pub const fn new(kind: AlphabetKind, symbols: AlphabetSymbolSet) -> Self {
141        Self { kind, symbols }
142    }
143
144    /// Returns the simple DNA alphabet `A`, `C`, `G`, `T`.
145    #[must_use]
146    pub fn dna() -> Self {
147        Self::from_static(AlphabetKind::Dna, "ACGT")
148    }
149
150    /// Returns the simple RNA alphabet `A`, `C`, `G`, `U`.
151    #[must_use]
152    pub fn rna() -> Self {
153        Self::from_static(AlphabetKind::Rna, "ACGU")
154    }
155
156    /// Returns the common protein alphabet.
157    #[must_use]
158    pub fn protein() -> Self {
159        Self::from_static(AlphabetKind::Protein, "ACDEFGHIKLMNPQRSTVWY")
160    }
161
162    /// Returns a DNA alphabet including common ambiguity symbols.
163    #[must_use]
164    pub fn dna_with_ambiguity() -> Self {
165        Self::from_static(AlphabetKind::DnaWithAmbiguity, "ACGTRYSWKMBDHVN")
166    }
167
168    /// Returns an RNA alphabet including common ambiguity symbols.
169    #[must_use]
170    pub fn rna_with_ambiguity() -> Self {
171        Self::from_static(AlphabetKind::RnaWithAmbiguity, "ACGURYSWKMBDHVN")
172    }
173
174    /// Returns a protein alphabet including common ambiguity symbols.
175    #[must_use]
176    pub fn protein_with_ambiguity() -> Self {
177        Self::from_static(
178            AlphabetKind::ProteinWithAmbiguity,
179            "ABCDEFGHIKLMNPQRSTVWXYZ*",
180        )
181    }
182
183    /// Creates a custom alphabet.
184    ///
185    /// # Errors
186    ///
187    /// Returns [`AlphabetError::EmptySymbolSet`] when no symbols are supplied.
188    pub fn custom(
189        kind: impl Into<String>,
190        symbols: impl AsRef<str>,
191    ) -> Result<Self, AlphabetError> {
192        Ok(Self::new(
193            AlphabetKind::Custom(kind.into()),
194            AlphabetSymbolSet::from_symbols(symbols)?,
195        ))
196    }
197
198    /// Returns the descriptive alphabet kind.
199    #[must_use]
200    pub const fn kind(&self) -> &AlphabetKind {
201        &self.kind
202    }
203
204    /// Returns the alphabet symbols.
205    #[must_use]
206    pub const fn symbols(&self) -> &AlphabetSymbolSet {
207        &self.symbols
208    }
209
210    /// Returns true when the symbol is present in the alphabet.
211    #[must_use]
212    pub fn contains(&self, symbol: char) -> bool {
213        self.symbols.contains(symbol)
214    }
215
216    /// Returns true when every character in the supplied text is present in the alphabet.
217    #[must_use]
218    pub fn contains_all(&self, text: impl AsRef<str>) -> bool {
219        text.as_ref().chars().all(|symbol| self.contains(symbol))
220    }
221
222    fn from_static(kind: AlphabetKind, symbols: &str) -> Self {
223        let symbols = AlphabetSymbolSet {
224            symbols: symbols.chars().collect(),
225        };
226        Self::new(kind, symbols)
227    }
228}
229
#[cfg(test)]
mod tests {
    use super::{AlphabetError, AlphabetKind, AlphabetSymbolSet, BioAlphabet};
    use core::str::FromStr;

    /// The DNA preset reports the `Dna` kind and accepts its four bases.
    #[test]
    fn dna_alphabet_contains_standard_symbols() {
        let alphabet = BioAlphabet::dna();

        assert_eq!(alphabet.kind(), &AlphabetKind::Dna);
        assert!(alphabet.contains_all("ACGT"));
    }

    /// The RNA preset reports the `Rna` kind and accepts its four bases.
    #[test]
    fn rna_alphabet_contains_standard_symbols() {
        let alphabet = BioAlphabet::rna();

        assert_eq!(alphabet.kind(), &AlphabetKind::Rna);
        assert!(alphabet.contains_all("ACGU"));
    }

    /// The protein preset accepts every symbol it was built from.
    #[test]
    fn protein_alphabet_contains_common_symbols() {
        let residues = "ACDEFGHIKLMNPQRSTVWY";

        assert!(BioAlphabet::protein().contains_all(residues));
    }

    /// `U` is not a DNA symbol, alone or inside a longer text.
    #[test]
    fn invalid_symbol_is_rejected_by_membership_check() {
        let alphabet = BioAlphabet::dna();

        assert!(!alphabet.contains_all("ACGU"));
        assert!(!alphabet.contains('U'));
    }

    /// Custom alphabets keep their label and accept their symbols in any order.
    #[test]
    fn constructs_custom_alphabet() {
        let toy = BioAlphabet::custom("toy", "ABC").expect("valid alphabet");
        let expected_kind = AlphabetKind::Custom(String::from("toy"));

        assert_eq!(toy.kind(), &expected_kind);
        assert!(toy.contains_all("CBA"));
    }

    /// An empty string cannot form a symbol set.
    #[test]
    fn rejects_empty_symbol_set() {
        let outcome = AlphabetSymbolSet::from_symbols("");

        assert_eq!(outcome, Err(AlphabetError::EmptySymbolSet));
    }

    /// Built-in kinds have stable display names and parse back from them.
    #[test]
    fn alphabet_kind_displays_and_parses() {
        let rendered = AlphabetKind::DnaWithAmbiguity.to_string();
        assert_eq!(rendered, "dna-with-ambiguity");

        let parsed = AlphabetKind::from_str("protein");
        assert_eq!(parsed, Ok(AlphabetKind::Protein));
    }
}