bio/alphabets/
mod.rs

1// Copyright 2014-2015 Johannes Köster, Peer Aramillo Irizar.
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//! Implementation of alphabets and useful utilities.
7//!
8//! # Example
9//!
10//! ```rust
11//! use bio::alphabets;
12//! let alphabet = alphabets::dna::alphabet();
13//! assert!(alphabet.is_word(b"AACCTgga"));
14//! assert!(!alphabet.is_word(b"AXYZ"));
15//! ```
16
17use std::borrow::Borrow;
18
19use bit_set::BitSet;
20use vec_map::VecMap;
21
22pub mod dna;
23pub mod protein;
24pub mod rna;
25
26pub type SymbolRanks = VecMap<u8>;
27
28/// Representation of an alphabet.
29#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
30pub struct Alphabet {
31    pub symbols: BitSet,
32}
33
34impl Alphabet {
35    /// Create new alphabet from given symbols.
36    ///
37    /// Complexity: O(n), where n is the number of symbols in the alphabet.
38    ///
39    /// # Example
40    ///
41    /// ```
42    /// use bio::alphabets;
43    ///
44    /// // Create an alphabet (note that a DNA alphabet is already available in bio::alphabets::dna).
45    /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt");
46    /// // Check whether a given text is a word over the alphabet.
47    /// assert!(dna_alphabet.is_word(b"GAttACA"));
48    /// ```
49    pub fn new<C, T>(symbols: T) -> Self
50    where
51        C: Borrow<u8>,
52        T: IntoIterator<Item = C>,
53    {
54        let mut s = BitSet::new();
55        s.extend(symbols.into_iter().map(|c| *c.borrow() as usize));
56
57        Alphabet { symbols: s }
58    }
59
60    /// Insert symbol into alphabet.
61    ///
62    /// Complexity: O(1)
63    ///
64    /// # Example
65    ///
66    /// ```
67    /// use bio::alphabets;
68    ///
69    /// let mut dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt");
70    /// assert!(!dna_alphabet.is_word(b"N"));
71    /// dna_alphabet.insert(78);
72    /// assert!(dna_alphabet.is_word(b"N"));
73    /// ```
74    pub fn insert(&mut self, a: u8) {
75        self.symbols.insert(a as usize);
76    }
77
78    /// Check if given text is a word over the alphabet.
79    ///
80    /// Complexity: O(n), where n is the length of the text.
81    ///
82    /// # Example
83    ///
84    /// ```
85    /// use bio::alphabets;
86    ///
87    /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt");
88    /// assert!(dna_alphabet.is_word(b"GAttACA"));
89    /// assert!(!dna_alphabet.is_word(b"42"));
90    /// ```
91    pub fn is_word<C, T>(&self, text: T) -> bool
92    where
93        C: Borrow<u8>,
94        T: IntoIterator<Item = C>,
95    {
96        text.into_iter()
97            .all(|c| self.symbols.contains(*c.borrow() as usize))
98    }
99
100    /// Return lexicographically maximal symbol.
101    ///
102    /// Complexity: O(n), where n is the number of symbols in the alphabet.
103    ///
104    /// # Example
105    ///
106    /// ```
107    /// use bio::alphabets;
108    ///
109    /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
110    /// assert_eq!(dna_alphabet.max_symbol(), Some(116)); // max symbol is "t"
111    /// let empty_alphabet = alphabets::Alphabet::new(b"");
112    /// assert_eq!(empty_alphabet.max_symbol(), None);
113    /// ```
114    pub fn max_symbol(&self) -> Option<u8> {
115        self.symbols.iter().max().map(|a| a as u8)
116    }
117
118    /// Return size of the alphabet.
119    ///
120    /// Upper and lower case representations of the same character
121    /// are counted as distinct characters.
122    ///
123    /// Complexity: O(n), where n is the number of symbols in the alphabet.
124    ///
125    /// # Example
126    ///
127    /// ```
128    /// use bio::alphabets;
129    ///
130    /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
131    /// assert_eq!(dna_alphabet.len(), 8);
132    /// ```
133    pub fn len(&self) -> usize {
134        self.symbols.len()
135    }
136
137    /// Is this alphabet empty?
138    ///
139    /// Complexity: O(n), where n is the number of symbols in the alphabet.
140    ///
141    /// # Example
142    ///
143    /// ```
144    /// use bio::alphabets;
145    ///
146    /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
147    /// assert!(!dna_alphabet.is_empty());
148    /// let empty_alphabet = alphabets::Alphabet::new(b"");
149    /// assert!(empty_alphabet.is_empty());
150    /// ```
151    pub fn is_empty(&self) -> bool {
152        self.symbols.is_empty()
153    }
154
155    /// Return a new alphabet taking the intersect between this and other.
156    ///
157    /// # Example
158    /// ```
159    /// use bio::alphabets;
160    ///
161    /// let alpha_a = alphabets::Alphabet::new(b"acgtACGT");
162    /// let alpha_b = alphabets::Alphabet::new(b"atcgMVP");
163    /// let intersect_alpha = alpha_a.intersection(&alpha_b);
164    ///
165    /// assert_eq!(intersect_alpha, alphabets::Alphabet::new(b"atcg"));
166    /// ```
167    pub fn intersection(&self, other: &Alphabet) -> Self {
168        return Alphabet {
169            symbols: self.symbols.intersection(&other.symbols).collect(),
170        };
171    }
172
173    /// Return a new alphabet taking the difference between this and other.
174    ///
175    /// # Example
176    /// ```
177    /// use bio::alphabets;
178    ///
179    /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
180    /// let dna_alphabet_upper = alphabets::Alphabet::new(b"ACGT");
181    /// let dna_lower = dna_alphabet.difference(&dna_alphabet_upper);
182    ///
183    /// assert_eq!(dna_lower, alphabets::Alphabet::new(b"atcg"));
184    /// ```
185    pub fn difference(&self, other: &Alphabet) -> Self {
186        return Alphabet {
187            symbols: self.symbols.difference(&other.symbols).collect(),
188        };
189    }
190
191    /// Return a new alphabet taking the union between this and other.
192    ///
193    /// # Example
194    /// ```
195    /// use bio::alphabets;
196    ///
197    /// let dna_alphabet = alphabets::Alphabet::new(b"ATCG");
198    /// let tokenize_alpha = alphabets::Alphabet::new(b"?|");
199    /// let alpha = dna_alphabet.union(&tokenize_alpha);
200    ///
201    /// assert_eq!(alpha, alphabets::Alphabet::new(b"ATCG?|"));
202    /// ```
203    pub fn union(&self, other: &Alphabet) -> Self {
204        return Alphabet {
205            symbols: self.symbols.union(&other.symbols).collect(),
206        };
207    }
208}
209
210/// Tools based on transforming the alphabet symbols to their lexicographical ranks.
211///
212/// Lexicographical rank is computed using `u8` representations,
213/// i.e. ASCII codes, of the input characters.
214/// For example, assuming that the alphabet consists of the symbols `A`, `C`, `G`, and `T`, this
215/// will yield ranks `0`, `1`, `2`, `3` for them, respectively.
216///
217/// `RankTransform` can be used in to perform bit encoding for texts over a
218/// given alphabet via `bio::data_structures::bitenc`.
219#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)]
220pub struct RankTransform {
221    pub ranks: SymbolRanks,
222}
223
224impl RankTransform {
225    /// Construct a new `RankTransform`.
226    ///
227    /// Complexity: O(n), where n is the number of symbols in the alphabet.
228    ///
229    /// # Example
230    ///
231    /// ```
232    /// use bio::alphabets;
233    ///
234    /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
235    /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
236    /// ```
237    pub fn new(alphabet: &Alphabet) -> Self {
238        let mut ranks = VecMap::new();
239        for (r, c) in alphabet.symbols.iter().enumerate() {
240            ranks.insert(c, r as u8);
241        }
242
243        RankTransform { ranks }
244    }
245
246    /// Get the rank of symbol `a`.
247    ///
248    /// This method panics for characters not contained in the alphabet.
249    ///
250    /// Complexity: O(1)
251    ///
252    /// # Example
253    ///
254    /// ```
255    /// use bio::alphabets;
256    ///
257    /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
258    /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
259    /// assert_eq!(dna_ranks.get(65), 0); // "A"
260    /// assert_eq!(dna_ranks.get(116), 7); // "t"
261    /// ```
262    #[inline]
263    pub fn get(&self, a: u8) -> u8 {
264        *self.ranks.get(a as usize).expect("Unexpected character.")
265    }
266
267    /// Transform a given `text` into a vector of rank values.
268    ///
269    /// Complexity: O(n), where n is the length of the text.
270    ///
271    /// # Example
272    ///
273    /// ```
274    /// use bio::alphabets;
275    ///
276    /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt");
277    /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
278    /// let text = b"aAcCgGtT";
279    /// assert_eq!(dna_ranks.transform(text), vec![4, 0, 5, 1, 6, 2, 7, 3]);
280    /// ```
281    pub fn transform<C, T>(&self, text: T) -> Vec<u8>
282    where
283        C: Borrow<u8>,
284        T: IntoIterator<Item = C>,
285    {
286        text.into_iter()
287            .map(|c| {
288                *self
289                    .ranks
290                    .get(*c.borrow() as usize)
291                    .expect("Unexpected character in text.")
292            })
293            .collect()
294    }
295
296    /// Iterate over q-grams (substrings of length q) of given `text`. The q-grams are encoded
297    /// as `usize` by storing the symbol ranks in log2(|A|) bits (with |A| being the alphabet size).
298    ///
299    /// If q is larger than usize::BITS / log2(|A|), this method fails with an assertion.
300    ///
301    /// Complexity: O(n), where n is the length of the text.
302    ///
303    /// # Example
304    ///
305    /// ```
306    /// use bio::alphabets;
307    ///
308    /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt");
309    /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
310    ///
311    /// let q_grams: Vec<usize> = dna_ranks.qgrams(2, b"ACGT").collect();
312    /// assert_eq!(q_grams, vec![1, 10, 19]);
313    /// ```
314    pub fn qgrams<C, T>(&self, q: u32, text: T) -> QGrams<'_, C, T::IntoIter>
315    where
316        C: Borrow<u8>,
317        T: IntoIterator<Item = C>,
318    {
319        assert!(q > 0, "Expecting q-gram length q to be larger than 0.");
320        let bits = (self.ranks.len() as f32).log2().ceil() as u32;
321        assert!(
322            bits * q <= usize::BITS,
323            "Expecting q to be smaller than usize / log2(|A|)"
324        );
325
326        let mut qgrams = QGrams {
327            text: text.into_iter(),
328            ranks: self,
329            q,
330            bits,
331            mask: 1usize.checked_shl(q * bits).unwrap_or(0).wrapping_sub(1),
332            qgram: 0,
333        };
334
335        for _ in 0..q - 1 {
336            qgrams.next();
337        }
338
339        qgrams
340    }
341
342    /// Iterate over q-grams (substrings of length q) of given `text` in reverse. The q-grams are encoded
343    /// as `usize` by storing the symbol ranks in log2(|A|) bits (with |A| being the alphabet size).
344    ///
345    /// If q is larger than usize::BITS / log2(|A|), this method fails with an assertion.
346    ///
347    /// Complexity: O(n), where n is the length of the text.
348    ///
349    /// # Example
350    ///
351    /// ```
352    /// use bio::alphabets;
353    ///
354    /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt");
355    /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
356    ///
357    /// let q_grams: Vec<usize> = dna_ranks.rev_qgrams(2, b"ACGT").collect();
358    /// assert_eq!(q_grams, vec![19, 10, 1]);
359    /// ```
360    pub fn rev_qgrams<C, IT, T>(&self, q: u32, text: IT) -> RevQGrams<'_, C, T>
361    where
362        C: Borrow<u8>,
363        T: DoubleEndedIterator<Item = C>,
364        IT: IntoIterator<IntoIter = T>,
365    {
366        assert!(q > 0, "Expecting q-gram length q to be larger than 0.");
367        let bits = (self.ranks.len() as f32).log2().ceil() as u32;
368        assert!(
369            bits * q <= usize::BITS,
370            "Expecting q to be smaller than usize / log2(|A|)"
371        );
372
373        let mut rev_qgrams = RevQGrams {
374            text: text.into_iter(),
375            ranks: self,
376            q,
377            bits,
378            left_shift: (q - 1) * bits,
379            qgram: 0,
380        };
381
382        for _ in 0..q - 1 {
383            rev_qgrams.next();
384        }
385
386        rev_qgrams
387    }
388
389    /// Restore alphabet from transform.
390    ///
391    /// Complexity: O(n), where n is the number of symbols in the alphabet.
392    ///
393    /// # Example
394    ///
395    /// ```
396    /// use bio::alphabets;
397    ///
398    /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
399    /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
400    /// assert_eq!(dna_ranks.alphabet().symbols, dna_alphabet.symbols);
401    /// ```
402    pub fn alphabet(&self) -> Alphabet {
403        let mut symbols = BitSet::with_capacity(self.ranks.len());
404        symbols.extend(self.ranks.keys());
405        Alphabet { symbols }
406    }
407
408    /// Compute the number of bits required to encode the largest rank value.
409    ///
410    /// For example, the alphabet `b"ACGT"` with 4 symbols has the maximal rank
411    /// 3, which can be encoded in 2 bits.
412    ///
413    /// This value can be used to create a `data_structures::bitenc::BitEnc`
414    /// bit encoding tailored to the given alphabet.
415    ///
416    /// Complexity: O(n), where n is the number of symbols in the alphabet.
417    ///
418    /// # Example
419    ///
420    /// ```
421    /// use bio::alphabets;
422    ///
423    /// let dna_alphabet = alphabets::Alphabet::new(b"ACGT");
424    /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
425    /// assert_eq!(dna_ranks.get_width(), 2);
426    /// let dna_n_alphabet = alphabets::Alphabet::new(b"ACGTN");
427    /// let dna_n_ranks = alphabets::RankTransform::new(&dna_n_alphabet);
428    /// assert_eq!(dna_n_ranks.get_width(), 3);
429    /// ```
430    pub fn get_width(&self) -> usize {
431        (self.ranks.len() as f32).log2().ceil() as usize
432    }
433}
434
435/// Iterator over q-grams.
436#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize)]
437pub struct QGrams<'a, C, T>
438where
439    C: Borrow<u8>,
440    T: Iterator<Item = C>,
441{
442    text: T,
443    ranks: &'a RankTransform,
444    q: u32,
445    bits: u32,
446    mask: usize,
447    qgram: usize,
448}
449
450impl<'a, C, T> QGrams<'a, C, T>
451where
452    C: Borrow<u8>,
453    T: Iterator<Item = C>,
454{
455    /// Push a new character into the current qgram.
456    #[inline]
457    fn qgram_push(&mut self, a: u8) {
458        self.qgram <<= self.bits;
459        self.qgram |= a as usize;
460        self.qgram &= self.mask;
461    }
462}
463
464impl<'a, C, T> Iterator for QGrams<'a, C, T>
465where
466    C: Borrow<u8>,
467    T: Iterator<Item = C>,
468{
469    type Item = usize;
470
471    #[inline]
472    fn next(&mut self) -> Option<usize> {
473        match self.text.next() {
474            Some(a) => {
475                let b = self.ranks.get(*a.borrow());
476                self.qgram_push(b);
477                Some(self.qgram)
478            }
479            None => None,
480        }
481    }
482
483    fn size_hint(&self) -> (usize, Option<usize>) {
484        self.text.size_hint()
485    }
486}
487
488impl<'a, C, T> ExactSizeIterator for QGrams<'a, C, T>
489where
490    C: Borrow<u8>,
491    T: ExactSizeIterator<Item = C>,
492{
493}
494
495/// Iterator over q-grams.
496#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize)]
497pub struct RevQGrams<'a, C, T>
498where
499    C: Borrow<u8>,
500    T: DoubleEndedIterator<Item = C>,
501{
502    text: T,
503    ranks: &'a RankTransform,
504    q: u32,
505    bits: u32,
506    left_shift: u32,
507    qgram: usize,
508}
509
510impl<'a, C, T> RevQGrams<'a, C, T>
511where
512    C: Borrow<u8>,
513    T: DoubleEndedIterator<Item = C>,
514{
515    /// Push a new character into the current qgram in the reverse direction.
516    #[inline]
517    fn qgram_push_rev(&mut self, a: u8) {
518        self.qgram >>= self.bits;
519        self.qgram |= (a as usize) << self.left_shift;
520    }
521}
522
523impl<'a, C, T> Iterator for RevQGrams<'a, C, T>
524where
525    C: Borrow<u8>,
526    T: DoubleEndedIterator<Item = C>,
527{
528    type Item = usize;
529
530    #[inline]
531    fn next(&mut self) -> Option<usize> {
532        match self.text.next_back() {
533            Some(a) => {
534                let b = self.ranks.get(*a.borrow());
535                self.qgram_push_rev(b);
536                Some(self.qgram)
537            }
538            None => None,
539        }
540    }
541
542    fn size_hint(&self) -> (usize, Option<usize>) {
543        self.text.size_hint()
544    }
545}
546
547impl<'a, C, T> ExactSizeIterator for RevQGrams<'a, C, T>
548where
549    C: Borrow<u8>,
550    T: DoubleEndedIterator<Item = C> + ExactSizeIterator<Item = C>,
551{
552}
553
554/// Returns the english ascii lower case alphabet.
555pub fn english_ascii_lower_alphabet() -> Alphabet {
556    Alphabet::new(&b"abcdefghijklmnopqrstuvwxyz"[..])
557}
558
559/// Returns the english ascii upper case alphabet.
560pub fn english_ascii_upper_alphabet() -> Alphabet {
561    Alphabet::new(&b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"[..])
562}
563
564#[cfg(test)]
565mod tests {
566    use super::*;
567
568    #[test]
569    fn test_alphabet_eq() {
570        assert_eq!(Alphabet::new(b"ATCG"), Alphabet::new(b"ATCG"));
571        assert_eq!(Alphabet::new(b"ATCG"), Alphabet::new(b"TAGC"));
572        assert_ne!(Alphabet::new(b"ATCG"), Alphabet::new(b"ATC"));
573    }
574
575    /// When `q * bits == usize::BITS`, make sure that `1<<(q*bits)` does not overflow.
576    #[test]
577    fn test_qgram_shiftleft_overflow() {
578        let alphabet = Alphabet::new(b"ACTG");
579        let transform = RankTransform::new(&alphabet);
580        let text = b"ACTG".repeat(100);
581        transform.qgrams(usize::BITS / 2, text);
582    }
583
584    #[test]
585    fn test_qgrams() {
586        let alphabet = Alphabet::new(b"ACTG");
587        let transform = RankTransform::new(&alphabet);
588        let text = b"ACTGA";
589        let mut qgrams = transform.qgrams(4, text);
590        assert_eq!(qgrams.next(), transform.qgrams(4, b"ACTG").next());
591        assert_eq!(qgrams.next(), transform.qgrams(4, b"CTGA").next());
592        assert_eq!(qgrams.next(), None);
593    }
594
595    #[test]
596    fn test_rev_qgrams() {
597        let alphabet = Alphabet::new(b"ACTG");
598        let transform = RankTransform::new(&alphabet);
599        let text = b"ACTGA";
600        let mut rev_qgrams = transform.rev_qgrams(4, text);
601        assert_eq!(rev_qgrams.next(), transform.qgrams(4, b"CTGA").next());
602        assert_eq!(rev_qgrams.next(), transform.qgrams(4, b"ACTG").next());
603        assert_eq!(rev_qgrams.next(), None);
604    }
605
606    #[test]
607    fn test_exactsize_iterator() {
608        let alphabet = Alphabet::new(b"ACTG");
609        let transform = RankTransform::new(&alphabet);
610        let text = b"ACTGACTG";
611
612        let mut qgrams = transform.qgrams(4, text);
613        assert_eq!(qgrams.len(), 5);
614        qgrams.next();
615        assert_eq!(qgrams.len(), 4);
616
617        let mut rev_qgrams = transform.rev_qgrams(4, text);
618        assert_eq!(rev_qgrams.len(), 5);
619        rev_qgrams.next();
620        assert_eq!(rev_qgrams.len(), 4);
621
622        let qgrams = transform.qgrams(4, b"AC");
623        assert_eq!(qgrams.len(), 0);
624        let rev_qgrams = transform.rev_qgrams(4, b"AC");
625        assert_eq!(rev_qgrams.len(), 0);
626    }
627}