lightmotif/
abc.rs

1//! Digital encoding for biological sequences using an alphabet.
2
3use std::fmt::Debug;
4use std::ops::Index;
5
6use generic_array::ArrayLength;
7use generic_array::GenericArray;
8use typenum::consts::U21;
9use typenum::consts::U5;
10use typenum::marker_traits::NonZero;
11use typenum::marker_traits::Unsigned;
12
13use super::err::InvalidData;
14use super::err::InvalidSymbol;
15use super::seq::SymbolCount;
16
17// --- Symbol ------------------------------------------------------------------
18
19/// A symbol from a biological alphabet.
20pub trait Symbol: Default + Sized + Copy + Eq {
21    /// View this symbol as a zero-based index.
22    fn as_index(&self) -> usize;
23    /// View this symbol as a string character.
24    fn as_char(&self) -> char {
25        self.as_ascii() as char
26    }
27    /// Parse a string character into a symbol.
28    fn from_char(c: char) -> Result<Self, InvalidSymbol> {
29        if c.is_ascii() {
30            Self::from_ascii(c as u8)
31        } else {
32            Err(InvalidSymbol(c))
33        }
34    }
35    /// View this symbol as an ASCII charater.
36    fn as_ascii(&self) -> u8;
37    /// Parse an ASCII character into a symbol.
38    fn from_ascii(c: u8) -> Result<Self, InvalidSymbol>;
39}
40
41/// A symbol that can be complemented.
42pub trait ComplementableSymbol: Symbol {
43    /// Get the complement of this symbol.
44    fn complement(&self) -> Self;
45}
46
47// --- Alphabet ----------------------------------------------------------------
48
49/// A biological alphabet with associated metadata.
50pub trait Alphabet: Debug + Copy + Default + 'static {
51    type Symbol: Symbol + Debug;
52    type K: Unsigned + NonZero + ArrayLength + Debug;
53
54    /// Get the default symbol for this alphabet.
55    #[inline]
56    fn default_symbol() -> Self::Symbol {
57        Default::default()
58    }
59
60    /// Get all the symbols of this alphabet.
61    fn symbols() -> &'static [Self::Symbol];
62
63    /// Get a string with all symbols from this alphabet.
64    fn as_str() -> &'static str;
65}
66
67// --- ComplementableAlphabet --------------------------------------------------
68
69/// An alphabet that defines the complement operation.
70pub trait ComplementableAlphabet: Alphabet {
71    /// Get the complement of this symbol.
72    fn complement(s: Self::Symbol) -> Self::Symbol;
73}
74
75impl<A: Alphabet> ComplementableAlphabet for A
76where
77    <A as Alphabet>::Symbol: ComplementableSymbol,
78{
79    #[inline]
80    fn complement(s: Self::Symbol) -> Self::Symbol {
81        s.complement()
82    }
83}
84
85// --- DNA ---------------------------------------------------------------------
86
87/// The standard DNA alphabet composed of 4 deoxyribonucleotides and a wildcard.
88#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
89pub struct Dna;
90
91impl Alphabet for Dna {
92    type Symbol = Nucleotide;
93    type K = U5;
94
95    #[inline]
96    fn symbols() -> &'static [Nucleotide] {
97        &[
98            Nucleotide::A,
99            Nucleotide::C,
100            Nucleotide::T,
101            Nucleotide::G,
102            Nucleotide::N,
103        ]
104    }
105
106    #[inline]
107    fn as_str() -> &'static str {
108        "ACTGN"
109    }
110}
111
112/// A deoxyribonucleotide.
113#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
114#[repr(u8)]
115pub enum Nucleotide {
116    /// Adenine.
117    ///
118    /// ![adenine.png](https://www.ebi.ac.uk/chebi/displayImage.do?defaultImage=true&imageIndex=0&chebiId=16708)
119    A = 0,
120    /// Cytosine.
121    ///
122    /// ![cytosine.png](https://www.ebi.ac.uk/chebi/displayImage.do?defaultImage=true&imageIndex=0&chebiId=16040)
123    C = 1,
124    /// Thymine.
125    ///
126    /// ![thymine.png](https://www.ebi.ac.uk/chebi/displayImage.do?defaultImage=true&imageIndex=0&chebiId=17821)
127    T = 2,
128    /// Guanine.
129    ///
130    /// ![guanine.png](https://www.ebi.ac.uk/chebi/displayImage.do?defaultImage=true&imageIndex=0&chebiId=16235)
131    G = 3,
132    /// Unknown base.
133    #[default]
134    N = 4,
135}
136
137impl From<Nucleotide> for char {
138    #[inline]
139    fn from(n: Nucleotide) -> char {
140        n.as_char()
141    }
142}
143
144impl Symbol for Nucleotide {
145    #[inline]
146    fn as_index(&self) -> usize {
147        *self as usize
148    }
149
150    #[inline]
151    fn as_ascii(&self) -> u8 {
152        match self {
153            Nucleotide::A => b'A',
154            Nucleotide::C => b'C',
155            Nucleotide::T => b'T',
156            Nucleotide::G => b'G',
157            Nucleotide::N => b'N',
158        }
159    }
160
161    #[inline]
162    fn from_ascii(c: u8) -> Result<Self, InvalidSymbol> {
163        match c {
164            b'A' => Ok(Nucleotide::A),
165            b'C' => Ok(Nucleotide::C),
166            b'T' => Ok(Nucleotide::T),
167            b'G' => Ok(Nucleotide::G),
168            b'N' => Ok(Nucleotide::N),
169            _ => Err(InvalidSymbol(c as char)),
170        }
171    }
172}
173
174impl ComplementableSymbol for Nucleotide {
175    #[inline]
176    fn complement(&self) -> Self {
177        match *self {
178            Nucleotide::A => Nucleotide::T,
179            Nucleotide::T => Nucleotide::A,
180            Nucleotide::G => Nucleotide::C,
181            Nucleotide::C => Nucleotide::G,
182            Nucleotide::N => Nucleotide::N,
183        }
184    }
185}
186
187// --- Protein -----------------------------------------------------------------
188
189/// The standard protein alphabet composed of 20 residues and a wildcard.
190#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
191pub struct Protein;
192
193impl Alphabet for Protein {
194    type Symbol = AminoAcid;
195    type K = U21;
196
197    #[inline]
198    fn symbols() -> &'static [AminoAcid] {
199        &[
200            AminoAcid::A,
201            AminoAcid::C,
202            AminoAcid::D,
203            AminoAcid::E,
204            AminoAcid::F,
205            AminoAcid::G,
206            AminoAcid::H,
207            AminoAcid::I,
208            AminoAcid::K,
209            AminoAcid::L,
210            AminoAcid::M,
211            AminoAcid::N,
212            AminoAcid::P,
213            AminoAcid::Q,
214            AminoAcid::R,
215            AminoAcid::S,
216            AminoAcid::T,
217            AminoAcid::V,
218            AminoAcid::W,
219            AminoAcid::Y,
220            AminoAcid::X,
221        ]
222    }
223
224    #[inline]
225    fn as_str() -> &'static str {
226        "ACDEFGHIKLMNPQRSTVWYX"
227    }
228}
229
230/// A proteinogenic amino acid.
231#[derive(Clone, Copy, Default, Debug, PartialEq, Eq)]
232#[repr(u8)]
233pub enum AminoAcid {
234    A = 0,
235    C = 1,
236    D = 2,
237    E = 3,
238    F = 4,
239    G = 5,
240    H = 6,
241    I = 7,
242    K = 8,
243    L = 9,
244    M = 10,
245    N = 11,
246    P = 12,
247    Q = 13,
248    R = 14,
249    S = 15,
250    T = 16,
251    V = 17,
252    W = 18,
253    Y = 19,
254    #[default]
255    X = 20,
256}
257
258impl From<AminoAcid> for char {
259    #[inline]
260    fn from(aa: AminoAcid) -> char {
261        aa.as_char()
262    }
263}
264
265impl Symbol for AminoAcid {
266    #[inline]
267    fn as_index(&self) -> usize {
268        *self as usize
269    }
270
271    #[inline]
272    fn as_ascii(&self) -> u8 {
273        match self {
274            AminoAcid::A => b'A',
275            AminoAcid::C => b'C',
276            AminoAcid::D => b'D',
277            AminoAcid::E => b'E',
278            AminoAcid::F => b'F',
279            AminoAcid::G => b'G',
280            AminoAcid::H => b'H',
281            AminoAcid::I => b'I',
282            AminoAcid::K => b'K',
283            AminoAcid::L => b'L',
284            AminoAcid::M => b'M',
285            AminoAcid::N => b'N',
286            AminoAcid::P => b'P',
287            AminoAcid::Q => b'Q',
288            AminoAcid::R => b'R',
289            AminoAcid::S => b'S',
290            AminoAcid::T => b'T',
291            AminoAcid::V => b'V',
292            AminoAcid::W => b'W',
293            AminoAcid::Y => b'Y',
294            AminoAcid::X => b'X',
295        }
296    }
297
298    #[inline]
299    fn from_ascii(c: u8) -> Result<Self, InvalidSymbol> {
300        match c {
301            b'A' => Ok(AminoAcid::A),
302            b'C' => Ok(AminoAcid::C),
303            b'D' => Ok(AminoAcid::D),
304            b'E' => Ok(AminoAcid::E),
305            b'F' => Ok(AminoAcid::F),
306            b'G' => Ok(AminoAcid::G),
307            b'H' => Ok(AminoAcid::H),
308            b'I' => Ok(AminoAcid::I),
309            b'K' => Ok(AminoAcid::K),
310            b'L' => Ok(AminoAcid::L),
311            b'M' => Ok(AminoAcid::M),
312            b'N' => Ok(AminoAcid::N),
313            b'P' => Ok(AminoAcid::P),
314            b'Q' => Ok(AminoAcid::Q),
315            b'R' => Ok(AminoAcid::R),
316            b'S' => Ok(AminoAcid::S),
317            b'T' => Ok(AminoAcid::T),
318            b'V' => Ok(AminoAcid::V),
319            b'W' => Ok(AminoAcid::W),
320            b'Y' => Ok(AminoAcid::Y),
321            b'X' => Ok(AminoAcid::X),
322            _ => Err(InvalidSymbol(c as char)),
323        }
324    }
325}
326
327// --- Background --------------------------------------------------------------
328
329/// The background frequencies for an alphabet.
330#[derive(Clone, Debug, PartialEq)]
331pub struct Background<A: Alphabet> {
332    frequencies: GenericArray<f32, A::K>,
333    alphabet: std::marker::PhantomData<A>,
334}
335
336impl<A: Alphabet> Background<A> {
337    /// Create a new background with the given frequencies.
338    ///
339    /// The array must contain valid frequencies, i.e. real numbers between
340    /// zero and one that sum to one.
341    pub fn new<F>(frequencies: F) -> Result<Self, InvalidData>
342    where
343        F: Into<GenericArray<f32, A::K>>,
344    {
345        let frequencies = frequencies.into();
346        let mut sum = 0.0;
347        for &f in frequencies.iter() {
348            if !(0.0..=1.0).contains(&f) {
349                return Err(InvalidData);
350            }
351            sum += f;
352        }
353        if sum != 1.0 {
354            return Err(InvalidData);
355        }
356        Ok(Self {
357            frequencies,
358            alphabet: std::marker::PhantomData,
359        })
360    }
361
362    /// Create a new background without checking frequencies.
363    #[doc(hidden)]
364    pub fn new_unchecked<F>(frequencies: F) -> Self
365    where
366        F: Into<GenericArray<f32, A::K>>,
367    {
368        Self {
369            frequencies: frequencies.into(),
370            alphabet: std::marker::PhantomData,
371        }
372    }
373
374    /// Create a new background from the given symbol counts.
375    ///
376    /// # Example
377    /// ```rust
378    /// # use generic_array::GenericArray;
379    /// # use lightmotif::abc::Background;
380    /// # use lightmotif::abc::Dna;
381    /// # use lightmotif::abc::Nucleotide::*;
382    /// let counts = GenericArray::from([ 2, 2, 5, 1, 0 ]);
383    /// let background = Background::<Dna>::from_counts(&counts).unwrap();
384    /// assert_eq!(background[A], 0.2);
385    /// assert_eq!(background[C], 0.2);
386    /// assert_eq!(background[T], 0.5);
387    /// assert_eq!(background[G], 0.1);
388    /// ```
389    pub fn from_counts(counts: &GenericArray<usize, A::K>) -> Result<Self, InvalidData> {
390        let total = counts.iter().sum::<usize>();
391        if total == 0 {
392            return Err(InvalidData);
393        }
394        let mut frequencies = GenericArray::<f32, A::K>::default();
395        for c in A::symbols() {
396            frequencies[c.as_index()] = counts[c.as_index()] as f32 / total as f32;
397        }
398        Ok(Self {
399            frequencies,
400            alphabet: std::marker::PhantomData,
401        })
402    }
403
404    /// Create a new background by counting symbol occurences in the given sequence.
405    ///
406    /// Pass `true` as the `unknown` argument to count the unknown symbol when
407    /// computing frequencies, or `false` to only count frequencies of known
408    /// symbols.
409    ///
410    /// # Example
411    /// ```rust
412    /// # use lightmotif::abc::Background;
413    /// # use lightmotif::abc::Dna;
414    /// # use lightmotif::abc::Nucleotide::*;
415    /// let sequence = [ T, T, A, T, G, T, T, A, C, C ];
416    /// let background = Background::<Dna>::from_sequence(&sequence[..], false).unwrap();
417    /// assert_eq!(background[A], 0.2);
418    /// assert_eq!(background[C], 0.2);
419    /// assert_eq!(background[T], 0.5);
420    /// assert_eq!(background[G], 0.1);
421    /// ```
422    pub fn from_sequence<S>(sequence: S, unknown: bool) -> Result<Self, InvalidData>
423    where
424        S: SymbolCount<A>,
425    {
426        let n = A::Symbol::default();
427        let mut base_counts = GenericArray::<usize, A::K>::default();
428        for &c in A::symbols() {
429            if unknown || c != n {
430                base_counts[c.as_index()] += sequence.count_symbol(c);
431            }
432        }
433        Self::from_counts(&base_counts)
434    }
435
436    /// Create a new background by counting symbol occurences in the given sequences.
437    ///
438    /// Pass `true` as the `unknown` argument to count the unknown symbol when
439    /// computing frequencies, or `false` to only count frequencies of known
440    /// symbols.
441    pub fn from_sequences<I>(sequences: I, unknown: bool) -> Result<Self, InvalidData>
442    where
443        I: IntoIterator,
444        <I as IntoIterator>::Item: SymbolCount<A>,
445    {
446        let n = A::Symbol::default();
447        let mut base_counts = GenericArray::<usize, A::K>::default();
448        for seq in sequences {
449            for &c in A::symbols() {
450                if unknown || c != n {
451                    base_counts[c.as_index()] += seq.count_symbol(c);
452                }
453            }
454        }
455        Self::from_counts(&base_counts)
456    }
457
458    /// Create a new background with uniform frequencies.
459    ///
460    /// The non-default symbols from the alphabet `A` will be initialized
461    /// with a frequency of `1/(K-1)`, while the default symbol will remain
462    /// with a zero frequency.
463    ///
464    /// # Note
465    /// The `Default` implementation for `Background` uses uniform frequencies.
466    ///
467    /// # Example
468    /// ```
469    /// # use lightmotif::abc::*;
470    /// let bg = Background::<Dna>::uniform();
471    /// assert_eq!(bg.frequencies(), &[0.25, 0.25, 0.25, 0.25, 0.0]);
472    /// ```
473    pub fn uniform() -> Self {
474        let frequencies = (0..A::K::USIZE)
475            .map(|i| {
476                if i != A::Symbol::default().as_index() {
477                    1.0 / ((A::K::USIZE - 1) as f32)
478                } else {
479                    0.0
480                }
481            })
482            .collect();
483        Self {
484            frequencies,
485            alphabet: std::marker::PhantomData,
486        }
487    }
488
489    /// A reference to the raw background frequencies.
490    #[inline]
491    pub fn frequencies(&self) -> &[f32] {
492        &self.frequencies
493    }
494}
495
496impl<A: Alphabet> AsRef<[f32]> for Background<A> {
497    #[inline]
498    fn as_ref(&self) -> &[f32] {
499        self.frequencies()
500    }
501}
502
503impl<A: Alphabet> AsRef<GenericArray<f32, A::K>> for Background<A> {
504    #[inline]
505    fn as_ref(&self) -> &GenericArray<f32, A::K> {
506        &self.frequencies
507    }
508}
509
510impl<A: Alphabet> Default for Background<A> {
511    #[inline]
512    fn default() -> Self {
513        Self::uniform()
514    }
515}
516
517impl<A: Alphabet> Index<A::Symbol> for Background<A> {
518    type Output = f32;
519    #[inline]
520    fn index(&self, index: A::Symbol) -> &Self::Output {
521        &self.frequencies()[index.as_index()]
522    }
523}
524
525// --- Pseudocounts ------------------------------------------------------------
526
527/// A structure for storing the pseudocounts over an alphabet.
528#[derive(Clone, Debug, PartialEq)]
529pub struct Pseudocounts<A: Alphabet> {
530    counts: GenericArray<f32, A::K>,
531    alphabet: std::marker::PhantomData<A>,
532}
533
534impl<A: Alphabet> Pseudocounts<A> {
535    #[inline]
536    pub fn counts(&self) -> &GenericArray<f32, A::K> {
537        &self.counts
538    }
539}
540
541impl<A: Alphabet> Default for Pseudocounts<A> {
542    #[inline]
543    fn default() -> Self {
544        Self::from(0.0)
545    }
546}
547
548impl<A: Alphabet> From<GenericArray<f32, A::K>> for Pseudocounts<A> {
549    #[inline]
550    fn from(counts: GenericArray<f32, A::K>) -> Self {
551        Self {
552            alphabet: std::marker::PhantomData,
553            counts,
554        }
555    }
556}
557
558impl<A: Alphabet> From<f32> for Pseudocounts<A> {
559    fn from(count: f32) -> Self {
560        let counts = (0..A::K::USIZE)
561            .map(|i| {
562                if i != A::Symbol::default().as_index() {
563                    count
564                } else {
565                    0.0
566                }
567            })
568            .collect();
569        Self {
570            counts,
571            alphabet: std::marker::PhantomData,
572        }
573    }
574}
575
576impl<A: Alphabet> AsRef<[f32]> for Pseudocounts<A> {
577    #[inline]
578    fn as_ref(&self) -> &[f32] {
579        &self.counts
580    }
581}
582
583impl<A: Alphabet> AsMut<[f32]> for Pseudocounts<A> {
584    #[inline]
585    fn as_mut(&mut self) -> &mut [f32] {
586        &mut self.counts
587    }
588}
589
590#[cfg(test)]
591mod test {
592    use super::*;
593
594    #[test]
595    fn background_new() {
596        assert!(Background::<Dna>::new([0.3, 0.2, 0.2, 0.3, 0.0]).is_ok());
597        assert!(Background::<Dna>::new([0.1, 0.1, 0.1, 0.1, 0.0]).is_err());
598    }
599}