bio/alphabets/mod.rs
1// Copyright 2014-2015 Johannes Köster, Peer Aramillo Irizar.
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//! Implementation of alphabets and useful utilities.
7//!
8//! # Example
9//!
10//! ```rust
11//! use bio::alphabets;
12//! let alphabet = alphabets::dna::alphabet();
13//! assert!(alphabet.is_word(b"AACCTgga"));
14//! assert!(!alphabet.is_word(b"AXYZ"));
15//! ```
16
17use std::borrow::Borrow;
18
19use bit_set::BitSet;
20use vec_map::VecMap;
21
22pub mod dna;
23pub mod protein;
24pub mod rna;
25
26pub type SymbolRanks = VecMap<u8>;
27
28/// Representation of an alphabet.
29#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
30pub struct Alphabet {
31 pub symbols: BitSet,
32}
33
34impl Alphabet {
35 /// Create new alphabet from given symbols.
36 ///
37 /// Complexity: O(n), where n is the number of symbols in the alphabet.
38 ///
39 /// # Example
40 ///
41 /// ```
42 /// use bio::alphabets;
43 ///
44 /// // Create an alphabet (note that a DNA alphabet is already available in bio::alphabets::dna).
45 /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt");
46 /// // Check whether a given text is a word over the alphabet.
47 /// assert!(dna_alphabet.is_word(b"GAttACA"));
48 /// ```
49 pub fn new<C, T>(symbols: T) -> Self
50 where
51 C: Borrow<u8>,
52 T: IntoIterator<Item = C>,
53 {
54 let mut s = BitSet::new();
55 s.extend(symbols.into_iter().map(|c| *c.borrow() as usize));
56
57 Alphabet { symbols: s }
58 }
59
60 /// Insert symbol into alphabet.
61 ///
62 /// Complexity: O(1)
63 ///
64 /// # Example
65 ///
66 /// ```
67 /// use bio::alphabets;
68 ///
69 /// let mut dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt");
70 /// assert!(!dna_alphabet.is_word(b"N"));
71 /// dna_alphabet.insert(78);
72 /// assert!(dna_alphabet.is_word(b"N"));
73 /// ```
74 pub fn insert(&mut self, a: u8) {
75 self.symbols.insert(a as usize);
76 }
77
78 /// Check if given text is a word over the alphabet.
79 ///
80 /// Complexity: O(n), where n is the length of the text.
81 ///
82 /// # Example
83 ///
84 /// ```
85 /// use bio::alphabets;
86 ///
87 /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt");
88 /// assert!(dna_alphabet.is_word(b"GAttACA"));
89 /// assert!(!dna_alphabet.is_word(b"42"));
90 /// ```
91 pub fn is_word<C, T>(&self, text: T) -> bool
92 where
93 C: Borrow<u8>,
94 T: IntoIterator<Item = C>,
95 {
96 text.into_iter()
97 .all(|c| self.symbols.contains(*c.borrow() as usize))
98 }
99
100 /// Return lexicographically maximal symbol.
101 ///
102 /// Complexity: O(n), where n is the number of symbols in the alphabet.
103 ///
104 /// # Example
105 ///
106 /// ```
107 /// use bio::alphabets;
108 ///
109 /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
110 /// assert_eq!(dna_alphabet.max_symbol(), Some(116)); // max symbol is "t"
111 /// let empty_alphabet = alphabets::Alphabet::new(b"");
112 /// assert_eq!(empty_alphabet.max_symbol(), None);
113 /// ```
114 pub fn max_symbol(&self) -> Option<u8> {
115 self.symbols.iter().max().map(|a| a as u8)
116 }
117
118 /// Return size of the alphabet.
119 ///
120 /// Upper and lower case representations of the same character
121 /// are counted as distinct characters.
122 ///
123 /// Complexity: O(n), where n is the number of symbols in the alphabet.
124 ///
125 /// # Example
126 ///
127 /// ```
128 /// use bio::alphabets;
129 ///
130 /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
131 /// assert_eq!(dna_alphabet.len(), 8);
132 /// ```
133 pub fn len(&self) -> usize {
134 self.symbols.len()
135 }
136
137 /// Is this alphabet empty?
138 ///
139 /// Complexity: O(n), where n is the number of symbols in the alphabet.
140 ///
141 /// # Example
142 ///
143 /// ```
144 /// use bio::alphabets;
145 ///
146 /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
147 /// assert!(!dna_alphabet.is_empty());
148 /// let empty_alphabet = alphabets::Alphabet::new(b"");
149 /// assert!(empty_alphabet.is_empty());
150 /// ```
151 pub fn is_empty(&self) -> bool {
152 self.symbols.is_empty()
153 }
154
155 /// Return a new alphabet taking the intersect between this and other.
156 ///
157 /// # Example
158 /// ```
159 /// use bio::alphabets;
160 ///
161 /// let alpha_a = alphabets::Alphabet::new(b"acgtACGT");
162 /// let alpha_b = alphabets::Alphabet::new(b"atcgMVP");
163 /// let intersect_alpha = alpha_a.intersection(&alpha_b);
164 ///
165 /// assert_eq!(intersect_alpha, alphabets::Alphabet::new(b"atcg"));
166 /// ```
167 pub fn intersection(&self, other: &Alphabet) -> Self {
168 return Alphabet {
169 symbols: self.symbols.intersection(&other.symbols).collect(),
170 };
171 }
172
173 /// Return a new alphabet taking the difference between this and other.
174 ///
175 /// # Example
176 /// ```
177 /// use bio::alphabets;
178 ///
179 /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
180 /// let dna_alphabet_upper = alphabets::Alphabet::new(b"ACGT");
181 /// let dna_lower = dna_alphabet.difference(&dna_alphabet_upper);
182 ///
183 /// assert_eq!(dna_lower, alphabets::Alphabet::new(b"atcg"));
184 /// ```
185 pub fn difference(&self, other: &Alphabet) -> Self {
186 return Alphabet {
187 symbols: self.symbols.difference(&other.symbols).collect(),
188 };
189 }
190
191 /// Return a new alphabet taking the union between this and other.
192 ///
193 /// # Example
194 /// ```
195 /// use bio::alphabets;
196 ///
197 /// let dna_alphabet = alphabets::Alphabet::new(b"ATCG");
198 /// let tokenize_alpha = alphabets::Alphabet::new(b"?|");
199 /// let alpha = dna_alphabet.union(&tokenize_alpha);
200 ///
201 /// assert_eq!(alpha, alphabets::Alphabet::new(b"ATCG?|"));
202 /// ```
203 pub fn union(&self, other: &Alphabet) -> Self {
204 return Alphabet {
205 symbols: self.symbols.union(&other.symbols).collect(),
206 };
207 }
208}
209
210/// Tools based on transforming the alphabet symbols to their lexicographical ranks.
211///
212/// Lexicographical rank is computed using `u8` representations,
213/// i.e. ASCII codes, of the input characters.
214/// For example, assuming that the alphabet consists of the symbols `A`, `C`, `G`, and `T`, this
215/// will yield ranks `0`, `1`, `2`, `3` for them, respectively.
216///
217/// `RankTransform` can be used in to perform bit encoding for texts over a
218/// given alphabet via `bio::data_structures::bitenc`.
219#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)]
220pub struct RankTransform {
221 pub ranks: SymbolRanks,
222}
223
224impl RankTransform {
225 /// Construct a new `RankTransform`.
226 ///
227 /// Complexity: O(n), where n is the number of symbols in the alphabet.
228 ///
229 /// # Example
230 ///
231 /// ```
232 /// use bio::alphabets;
233 ///
234 /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
235 /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
236 /// ```
237 pub fn new(alphabet: &Alphabet) -> Self {
238 let mut ranks = VecMap::new();
239 for (r, c) in alphabet.symbols.iter().enumerate() {
240 ranks.insert(c, r as u8);
241 }
242
243 RankTransform { ranks }
244 }
245
246 /// Get the rank of symbol `a`.
247 ///
248 /// This method panics for characters not contained in the alphabet.
249 ///
250 /// Complexity: O(1)
251 ///
252 /// # Example
253 ///
254 /// ```
255 /// use bio::alphabets;
256 ///
257 /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
258 /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
259 /// assert_eq!(dna_ranks.get(65), 0); // "A"
260 /// assert_eq!(dna_ranks.get(116), 7); // "t"
261 /// ```
262 #[inline]
263 pub fn get(&self, a: u8) -> u8 {
264 *self.ranks.get(a as usize).expect("Unexpected character.")
265 }
266
267 /// Transform a given `text` into a vector of rank values.
268 ///
269 /// Complexity: O(n), where n is the length of the text.
270 ///
271 /// # Example
272 ///
273 /// ```
274 /// use bio::alphabets;
275 ///
276 /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt");
277 /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
278 /// let text = b"aAcCgGtT";
279 /// assert_eq!(dna_ranks.transform(text), vec![4, 0, 5, 1, 6, 2, 7, 3]);
280 /// ```
281 pub fn transform<C, T>(&self, text: T) -> Vec<u8>
282 where
283 C: Borrow<u8>,
284 T: IntoIterator<Item = C>,
285 {
286 text.into_iter()
287 .map(|c| {
288 *self
289 .ranks
290 .get(*c.borrow() as usize)
291 .expect("Unexpected character in text.")
292 })
293 .collect()
294 }
295
296 /// Iterate over q-grams (substrings of length q) of given `text`. The q-grams are encoded
297 /// as `usize` by storing the symbol ranks in log2(|A|) bits (with |A| being the alphabet size).
298 ///
299 /// If q is larger than usize::BITS / log2(|A|), this method fails with an assertion.
300 ///
301 /// Complexity: O(n), where n is the length of the text.
302 ///
303 /// # Example
304 ///
305 /// ```
306 /// use bio::alphabets;
307 ///
308 /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt");
309 /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
310 ///
311 /// let q_grams: Vec<usize> = dna_ranks.qgrams(2, b"ACGT").collect();
312 /// assert_eq!(q_grams, vec![1, 10, 19]);
313 /// ```
314 pub fn qgrams<C, T>(&self, q: u32, text: T) -> QGrams<'_, C, T::IntoIter>
315 where
316 C: Borrow<u8>,
317 T: IntoIterator<Item = C>,
318 {
319 assert!(q > 0, "Expecting q-gram length q to be larger than 0.");
320 let bits = (self.ranks.len() as f32).log2().ceil() as u32;
321 assert!(
322 bits * q <= usize::BITS,
323 "Expecting q to be smaller than usize / log2(|A|)"
324 );
325
326 let mut qgrams = QGrams {
327 text: text.into_iter(),
328 ranks: self,
329 q,
330 bits,
331 mask: 1usize.checked_shl(q * bits).unwrap_or(0).wrapping_sub(1),
332 qgram: 0,
333 };
334
335 for _ in 0..q - 1 {
336 qgrams.next();
337 }
338
339 qgrams
340 }
341
342 /// Iterate over q-grams (substrings of length q) of given `text` in reverse. The q-grams are encoded
343 /// as `usize` by storing the symbol ranks in log2(|A|) bits (with |A| being the alphabet size).
344 ///
345 /// If q is larger than usize::BITS / log2(|A|), this method fails with an assertion.
346 ///
347 /// Complexity: O(n), where n is the length of the text.
348 ///
349 /// # Example
350 ///
351 /// ```
352 /// use bio::alphabets;
353 ///
354 /// let dna_alphabet = alphabets::Alphabet::new(b"ACGTacgt");
355 /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
356 ///
357 /// let q_grams: Vec<usize> = dna_ranks.rev_qgrams(2, b"ACGT").collect();
358 /// assert_eq!(q_grams, vec![19, 10, 1]);
359 /// ```
360 pub fn rev_qgrams<C, IT, T>(&self, q: u32, text: IT) -> RevQGrams<'_, C, T>
361 where
362 C: Borrow<u8>,
363 T: DoubleEndedIterator<Item = C>,
364 IT: IntoIterator<IntoIter = T>,
365 {
366 assert!(q > 0, "Expecting q-gram length q to be larger than 0.");
367 let bits = (self.ranks.len() as f32).log2().ceil() as u32;
368 assert!(
369 bits * q <= usize::BITS,
370 "Expecting q to be smaller than usize / log2(|A|)"
371 );
372
373 let mut rev_qgrams = RevQGrams {
374 text: text.into_iter(),
375 ranks: self,
376 q,
377 bits,
378 left_shift: (q - 1) * bits,
379 qgram: 0,
380 };
381
382 for _ in 0..q - 1 {
383 rev_qgrams.next();
384 }
385
386 rev_qgrams
387 }
388
389 /// Restore alphabet from transform.
390 ///
391 /// Complexity: O(n), where n is the number of symbols in the alphabet.
392 ///
393 /// # Example
394 ///
395 /// ```
396 /// use bio::alphabets;
397 ///
398 /// let dna_alphabet = alphabets::Alphabet::new(b"acgtACGT");
399 /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
400 /// assert_eq!(dna_ranks.alphabet().symbols, dna_alphabet.symbols);
401 /// ```
402 pub fn alphabet(&self) -> Alphabet {
403 let mut symbols = BitSet::with_capacity(self.ranks.len());
404 symbols.extend(self.ranks.keys());
405 Alphabet { symbols }
406 }
407
408 /// Compute the number of bits required to encode the largest rank value.
409 ///
410 /// For example, the alphabet `b"ACGT"` with 4 symbols has the maximal rank
411 /// 3, which can be encoded in 2 bits.
412 ///
413 /// This value can be used to create a `data_structures::bitenc::BitEnc`
414 /// bit encoding tailored to the given alphabet.
415 ///
416 /// Complexity: O(n), where n is the number of symbols in the alphabet.
417 ///
418 /// # Example
419 ///
420 /// ```
421 /// use bio::alphabets;
422 ///
423 /// let dna_alphabet = alphabets::Alphabet::new(b"ACGT");
424 /// let dna_ranks = alphabets::RankTransform::new(&dna_alphabet);
425 /// assert_eq!(dna_ranks.get_width(), 2);
426 /// let dna_n_alphabet = alphabets::Alphabet::new(b"ACGTN");
427 /// let dna_n_ranks = alphabets::RankTransform::new(&dna_n_alphabet);
428 /// assert_eq!(dna_n_ranks.get_width(), 3);
429 /// ```
430 pub fn get_width(&self) -> usize {
431 (self.ranks.len() as f32).log2().ceil() as usize
432 }
433}
434
435/// Iterator over q-grams.
436#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize)]
437pub struct QGrams<'a, C, T>
438where
439 C: Borrow<u8>,
440 T: Iterator<Item = C>,
441{
442 text: T,
443 ranks: &'a RankTransform,
444 q: u32,
445 bits: u32,
446 mask: usize,
447 qgram: usize,
448}
449
450impl<'a, C, T> QGrams<'a, C, T>
451where
452 C: Borrow<u8>,
453 T: Iterator<Item = C>,
454{
455 /// Push a new character into the current qgram.
456 #[inline]
457 fn qgram_push(&mut self, a: u8) {
458 self.qgram <<= self.bits;
459 self.qgram |= a as usize;
460 self.qgram &= self.mask;
461 }
462}
463
464impl<'a, C, T> Iterator for QGrams<'a, C, T>
465where
466 C: Borrow<u8>,
467 T: Iterator<Item = C>,
468{
469 type Item = usize;
470
471 #[inline]
472 fn next(&mut self) -> Option<usize> {
473 match self.text.next() {
474 Some(a) => {
475 let b = self.ranks.get(*a.borrow());
476 self.qgram_push(b);
477 Some(self.qgram)
478 }
479 None => None,
480 }
481 }
482
483 fn size_hint(&self) -> (usize, Option<usize>) {
484 self.text.size_hint()
485 }
486}
487
488impl<'a, C, T> ExactSizeIterator for QGrams<'a, C, T>
489where
490 C: Borrow<u8>,
491 T: ExactSizeIterator<Item = C>,
492{
493}
494
495/// Iterator over q-grams.
496#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize)]
497pub struct RevQGrams<'a, C, T>
498where
499 C: Borrow<u8>,
500 T: DoubleEndedIterator<Item = C>,
501{
502 text: T,
503 ranks: &'a RankTransform,
504 q: u32,
505 bits: u32,
506 left_shift: u32,
507 qgram: usize,
508}
509
510impl<'a, C, T> RevQGrams<'a, C, T>
511where
512 C: Borrow<u8>,
513 T: DoubleEndedIterator<Item = C>,
514{
515 /// Push a new character into the current qgram in the reverse direction.
516 #[inline]
517 fn qgram_push_rev(&mut self, a: u8) {
518 self.qgram >>= self.bits;
519 self.qgram |= (a as usize) << self.left_shift;
520 }
521}
522
523impl<'a, C, T> Iterator for RevQGrams<'a, C, T>
524where
525 C: Borrow<u8>,
526 T: DoubleEndedIterator<Item = C>,
527{
528 type Item = usize;
529
530 #[inline]
531 fn next(&mut self) -> Option<usize> {
532 match self.text.next_back() {
533 Some(a) => {
534 let b = self.ranks.get(*a.borrow());
535 self.qgram_push_rev(b);
536 Some(self.qgram)
537 }
538 None => None,
539 }
540 }
541
542 fn size_hint(&self) -> (usize, Option<usize>) {
543 self.text.size_hint()
544 }
545}
546
547impl<'a, C, T> ExactSizeIterator for RevQGrams<'a, C, T>
548where
549 C: Borrow<u8>,
550 T: DoubleEndedIterator<Item = C> + ExactSizeIterator<Item = C>,
551{
552}
553
554/// Returns the english ascii lower case alphabet.
555pub fn english_ascii_lower_alphabet() -> Alphabet {
556 Alphabet::new(&b"abcdefghijklmnopqrstuvwxyz"[..])
557}
558
559/// Returns the english ascii upper case alphabet.
560pub fn english_ascii_upper_alphabet() -> Alphabet {
561 Alphabet::new(&b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"[..])
562}
563
564#[cfg(test)]
565mod tests {
566 use super::*;
567
568 #[test]
569 fn test_alphabet_eq() {
570 assert_eq!(Alphabet::new(b"ATCG"), Alphabet::new(b"ATCG"));
571 assert_eq!(Alphabet::new(b"ATCG"), Alphabet::new(b"TAGC"));
572 assert_ne!(Alphabet::new(b"ATCG"), Alphabet::new(b"ATC"));
573 }
574
575 /// When `q * bits == usize::BITS`, make sure that `1<<(q*bits)` does not overflow.
576 #[test]
577 fn test_qgram_shiftleft_overflow() {
578 let alphabet = Alphabet::new(b"ACTG");
579 let transform = RankTransform::new(&alphabet);
580 let text = b"ACTG".repeat(100);
581 transform.qgrams(usize::BITS / 2, text);
582 }
583
584 #[test]
585 fn test_qgrams() {
586 let alphabet = Alphabet::new(b"ACTG");
587 let transform = RankTransform::new(&alphabet);
588 let text = b"ACTGA";
589 let mut qgrams = transform.qgrams(4, text);
590 assert_eq!(qgrams.next(), transform.qgrams(4, b"ACTG").next());
591 assert_eq!(qgrams.next(), transform.qgrams(4, b"CTGA").next());
592 assert_eq!(qgrams.next(), None);
593 }
594
595 #[test]
596 fn test_rev_qgrams() {
597 let alphabet = Alphabet::new(b"ACTG");
598 let transform = RankTransform::new(&alphabet);
599 let text = b"ACTGA";
600 let mut rev_qgrams = transform.rev_qgrams(4, text);
601 assert_eq!(rev_qgrams.next(), transform.qgrams(4, b"CTGA").next());
602 assert_eq!(rev_qgrams.next(), transform.qgrams(4, b"ACTG").next());
603 assert_eq!(rev_qgrams.next(), None);
604 }
605
606 #[test]
607 fn test_exactsize_iterator() {
608 let alphabet = Alphabet::new(b"ACTG");
609 let transform = RankTransform::new(&alphabet);
610 let text = b"ACTGACTG";
611
612 let mut qgrams = transform.qgrams(4, text);
613 assert_eq!(qgrams.len(), 5);
614 qgrams.next();
615 assert_eq!(qgrams.len(), 4);
616
617 let mut rev_qgrams = transform.rev_qgrams(4, text);
618 assert_eq!(rev_qgrams.len(), 5);
619 rev_qgrams.next();
620 assert_eq!(rev_qgrams.len(), 4);
621
622 let qgrams = transform.qgrams(4, b"AC");
623 assert_eq!(qgrams.len(), 0);
624 let rev_qgrams = transform.rev_qgrams(4, b"AC");
625 assert_eq!(rev_qgrams.len(), 0);
626 }
627}