1use std::fmt::Debug;
4use std::ops::Index;
5
6use generic_array::ArrayLength;
7use generic_array::GenericArray;
8use typenum::consts::U21;
9use typenum::consts::U5;
10use typenum::marker_traits::NonZero;
11use typenum::marker_traits::Unsigned;
12
13use super::err::InvalidData;
14use super::err::InvalidSymbol;
15use super::seq::SymbolCount;
16
17pub trait Symbol: Default + Sized + Copy + Eq {
21 fn as_index(&self) -> usize;
23 fn as_char(&self) -> char {
25 self.as_ascii() as char
26 }
27 fn from_char(c: char) -> Result<Self, InvalidSymbol> {
29 if c.is_ascii() {
30 Self::from_ascii(c as u8)
31 } else {
32 Err(InvalidSymbol(c))
33 }
34 }
35 fn as_ascii(&self) -> u8;
37 fn from_ascii(c: u8) -> Result<Self, InvalidSymbol>;
39}
40
41pub trait ComplementableSymbol: Symbol {
43 fn complement(&self) -> Self;
45}
46
47pub trait Alphabet: Debug + Copy + Default + 'static {
51 type Symbol: Symbol + Debug;
52 type K: Unsigned + NonZero + ArrayLength + Debug;
53
54 #[inline]
56 fn default_symbol() -> Self::Symbol {
57 Default::default()
58 }
59
60 fn symbols() -> &'static [Self::Symbol];
62
63 fn as_str() -> &'static str;
65}
66
67pub trait ComplementableAlphabet: Alphabet {
71 fn complement(s: Self::Symbol) -> Self::Symbol;
73}
74
75impl<A: Alphabet> ComplementableAlphabet for A
76where
77 <A as Alphabet>::Symbol: ComplementableSymbol,
78{
79 #[inline]
80 fn complement(s: Self::Symbol) -> Self::Symbol {
81 s.complement()
82 }
83}
84
85#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
89pub struct Dna;
90
91impl Alphabet for Dna {
92 type Symbol = Nucleotide;
93 type K = U5;
94
95 #[inline]
96 fn symbols() -> &'static [Nucleotide] {
97 &[
98 Nucleotide::A,
99 Nucleotide::C,
100 Nucleotide::T,
101 Nucleotide::G,
102 Nucleotide::N,
103 ]
104 }
105
106 #[inline]
107 fn as_str() -> &'static str {
108 "ACTGN"
109 }
110}
111
112#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
114#[repr(u8)]
115pub enum Nucleotide {
116 A = 0,
120 C = 1,
124 T = 2,
128 G = 3,
132 #[default]
134 N = 4,
135}
136
137impl From<Nucleotide> for char {
138 #[inline]
139 fn from(n: Nucleotide) -> char {
140 n.as_char()
141 }
142}
143
144impl Symbol for Nucleotide {
145 #[inline]
146 fn as_index(&self) -> usize {
147 *self as usize
148 }
149
150 #[inline]
151 fn as_ascii(&self) -> u8 {
152 match self {
153 Nucleotide::A => b'A',
154 Nucleotide::C => b'C',
155 Nucleotide::T => b'T',
156 Nucleotide::G => b'G',
157 Nucleotide::N => b'N',
158 }
159 }
160
161 #[inline]
162 fn from_ascii(c: u8) -> Result<Self, InvalidSymbol> {
163 match c {
164 b'A' => Ok(Nucleotide::A),
165 b'C' => Ok(Nucleotide::C),
166 b'T' => Ok(Nucleotide::T),
167 b'G' => Ok(Nucleotide::G),
168 b'N' => Ok(Nucleotide::N),
169 _ => Err(InvalidSymbol(c as char)),
170 }
171 }
172}
173
174impl ComplementableSymbol for Nucleotide {
175 #[inline]
176 fn complement(&self) -> Self {
177 match *self {
178 Nucleotide::A => Nucleotide::T,
179 Nucleotide::T => Nucleotide::A,
180 Nucleotide::G => Nucleotide::C,
181 Nucleotide::C => Nucleotide::G,
182 Nucleotide::N => Nucleotide::N,
183 }
184 }
185}
186
187#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
191pub struct Protein;
192
193impl Alphabet for Protein {
194 type Symbol = AminoAcid;
195 type K = U21;
196
197 #[inline]
198 fn symbols() -> &'static [AminoAcid] {
199 &[
200 AminoAcid::A,
201 AminoAcid::C,
202 AminoAcid::D,
203 AminoAcid::E,
204 AminoAcid::F,
205 AminoAcid::G,
206 AminoAcid::H,
207 AminoAcid::I,
208 AminoAcid::K,
209 AminoAcid::L,
210 AminoAcid::M,
211 AminoAcid::N,
212 AminoAcid::P,
213 AminoAcid::Q,
214 AminoAcid::R,
215 AminoAcid::S,
216 AminoAcid::T,
217 AminoAcid::V,
218 AminoAcid::W,
219 AminoAcid::Y,
220 AminoAcid::X,
221 ]
222 }
223
224 #[inline]
225 fn as_str() -> &'static str {
226 "ACDEFGHIKLMNPQRSTVWYX"
227 }
228}
229
230#[derive(Clone, Copy, Default, Debug, PartialEq, Eq)]
232#[repr(u8)]
233pub enum AminoAcid {
234 A = 0,
235 C = 1,
236 D = 2,
237 E = 3,
238 F = 4,
239 G = 5,
240 H = 6,
241 I = 7,
242 K = 8,
243 L = 9,
244 M = 10,
245 N = 11,
246 P = 12,
247 Q = 13,
248 R = 14,
249 S = 15,
250 T = 16,
251 V = 17,
252 W = 18,
253 Y = 19,
254 #[default]
255 X = 20,
256}
257
258impl From<AminoAcid> for char {
259 #[inline]
260 fn from(aa: AminoAcid) -> char {
261 aa.as_char()
262 }
263}
264
265impl Symbol for AminoAcid {
266 #[inline]
267 fn as_index(&self) -> usize {
268 *self as usize
269 }
270
271 #[inline]
272 fn as_ascii(&self) -> u8 {
273 match self {
274 AminoAcid::A => b'A',
275 AminoAcid::C => b'C',
276 AminoAcid::D => b'D',
277 AminoAcid::E => b'E',
278 AminoAcid::F => b'F',
279 AminoAcid::G => b'G',
280 AminoAcid::H => b'H',
281 AminoAcid::I => b'I',
282 AminoAcid::K => b'K',
283 AminoAcid::L => b'L',
284 AminoAcid::M => b'M',
285 AminoAcid::N => b'N',
286 AminoAcid::P => b'P',
287 AminoAcid::Q => b'Q',
288 AminoAcid::R => b'R',
289 AminoAcid::S => b'S',
290 AminoAcid::T => b'T',
291 AminoAcid::V => b'V',
292 AminoAcid::W => b'W',
293 AminoAcid::Y => b'Y',
294 AminoAcid::X => b'X',
295 }
296 }
297
298 #[inline]
299 fn from_ascii(c: u8) -> Result<Self, InvalidSymbol> {
300 match c {
301 b'A' => Ok(AminoAcid::A),
302 b'C' => Ok(AminoAcid::C),
303 b'D' => Ok(AminoAcid::D),
304 b'E' => Ok(AminoAcid::E),
305 b'F' => Ok(AminoAcid::F),
306 b'G' => Ok(AminoAcid::G),
307 b'H' => Ok(AminoAcid::H),
308 b'I' => Ok(AminoAcid::I),
309 b'K' => Ok(AminoAcid::K),
310 b'L' => Ok(AminoAcid::L),
311 b'M' => Ok(AminoAcid::M),
312 b'N' => Ok(AminoAcid::N),
313 b'P' => Ok(AminoAcid::P),
314 b'Q' => Ok(AminoAcid::Q),
315 b'R' => Ok(AminoAcid::R),
316 b'S' => Ok(AminoAcid::S),
317 b'T' => Ok(AminoAcid::T),
318 b'V' => Ok(AminoAcid::V),
319 b'W' => Ok(AminoAcid::W),
320 b'Y' => Ok(AminoAcid::Y),
321 b'X' => Ok(AminoAcid::X),
322 _ => Err(InvalidSymbol(c as char)),
323 }
324 }
325}
326
327#[derive(Clone, Debug, PartialEq)]
331pub struct Background<A: Alphabet> {
332 frequencies: GenericArray<f32, A::K>,
333 alphabet: std::marker::PhantomData<A>,
334}
335
336impl<A: Alphabet> Background<A> {
337 pub fn new<F>(frequencies: F) -> Result<Self, InvalidData>
342 where
343 F: Into<GenericArray<f32, A::K>>,
344 {
345 let frequencies = frequencies.into();
346 let mut sum = 0.0;
347 for &f in frequencies.iter() {
348 if !(0.0..=1.0).contains(&f) {
349 return Err(InvalidData);
350 }
351 sum += f;
352 }
353 if sum != 1.0 {
354 return Err(InvalidData);
355 }
356 Ok(Self {
357 frequencies,
358 alphabet: std::marker::PhantomData,
359 })
360 }
361
362 #[doc(hidden)]
364 pub fn new_unchecked<F>(frequencies: F) -> Self
365 where
366 F: Into<GenericArray<f32, A::K>>,
367 {
368 Self {
369 frequencies: frequencies.into(),
370 alphabet: std::marker::PhantomData,
371 }
372 }
373
374 pub fn from_counts(counts: &GenericArray<usize, A::K>) -> Result<Self, InvalidData> {
390 let total = counts.iter().sum::<usize>();
391 if total == 0 {
392 return Err(InvalidData);
393 }
394 let mut frequencies = GenericArray::<f32, A::K>::default();
395 for c in A::symbols() {
396 frequencies[c.as_index()] = counts[c.as_index()] as f32 / total as f32;
397 }
398 Ok(Self {
399 frequencies,
400 alphabet: std::marker::PhantomData,
401 })
402 }
403
404 pub fn from_sequence<S>(sequence: S, unknown: bool) -> Result<Self, InvalidData>
423 where
424 S: SymbolCount<A>,
425 {
426 let n = A::Symbol::default();
427 let mut base_counts = GenericArray::<usize, A::K>::default();
428 for &c in A::symbols() {
429 if unknown || c != n {
430 base_counts[c.as_index()] += sequence.count_symbol(c);
431 }
432 }
433 Self::from_counts(&base_counts)
434 }
435
436 pub fn from_sequences<I>(sequences: I, unknown: bool) -> Result<Self, InvalidData>
442 where
443 I: IntoIterator,
444 <I as IntoIterator>::Item: SymbolCount<A>,
445 {
446 let n = A::Symbol::default();
447 let mut base_counts = GenericArray::<usize, A::K>::default();
448 for seq in sequences {
449 for &c in A::symbols() {
450 if unknown || c != n {
451 base_counts[c.as_index()] += seq.count_symbol(c);
452 }
453 }
454 }
455 Self::from_counts(&base_counts)
456 }
457
458 pub fn uniform() -> Self {
474 let frequencies = (0..A::K::USIZE)
475 .map(|i| {
476 if i != A::Symbol::default().as_index() {
477 1.0 / ((A::K::USIZE - 1) as f32)
478 } else {
479 0.0
480 }
481 })
482 .collect();
483 Self {
484 frequencies,
485 alphabet: std::marker::PhantomData,
486 }
487 }
488
489 #[inline]
491 pub fn frequencies(&self) -> &[f32] {
492 &self.frequencies
493 }
494}
495
496impl<A: Alphabet> AsRef<[f32]> for Background<A> {
497 #[inline]
498 fn as_ref(&self) -> &[f32] {
499 self.frequencies()
500 }
501}
502
503impl<A: Alphabet> AsRef<GenericArray<f32, A::K>> for Background<A> {
504 #[inline]
505 fn as_ref(&self) -> &GenericArray<f32, A::K> {
506 &self.frequencies
507 }
508}
509
510impl<A: Alphabet> Default for Background<A> {
511 #[inline]
512 fn default() -> Self {
513 Self::uniform()
514 }
515}
516
517impl<A: Alphabet> Index<A::Symbol> for Background<A> {
518 type Output = f32;
519 #[inline]
520 fn index(&self, index: A::Symbol) -> &Self::Output {
521 &self.frequencies()[index.as_index()]
522 }
523}
524
525#[derive(Clone, Debug, PartialEq)]
529pub struct Pseudocounts<A: Alphabet> {
530 counts: GenericArray<f32, A::K>,
531 alphabet: std::marker::PhantomData<A>,
532}
533
534impl<A: Alphabet> Pseudocounts<A> {
535 #[inline]
536 pub fn counts(&self) -> &GenericArray<f32, A::K> {
537 &self.counts
538 }
539}
540
541impl<A: Alphabet> Default for Pseudocounts<A> {
542 #[inline]
543 fn default() -> Self {
544 Self::from(0.0)
545 }
546}
547
548impl<A: Alphabet> From<GenericArray<f32, A::K>> for Pseudocounts<A> {
549 #[inline]
550 fn from(counts: GenericArray<f32, A::K>) -> Self {
551 Self {
552 alphabet: std::marker::PhantomData,
553 counts,
554 }
555 }
556}
557
558impl<A: Alphabet> From<f32> for Pseudocounts<A> {
559 fn from(count: f32) -> Self {
560 let counts = (0..A::K::USIZE)
561 .map(|i| {
562 if i != A::Symbol::default().as_index() {
563 count
564 } else {
565 0.0
566 }
567 })
568 .collect();
569 Self {
570 counts,
571 alphabet: std::marker::PhantomData,
572 }
573 }
574}
575
576impl<A: Alphabet> AsRef<[f32]> for Pseudocounts<A> {
577 #[inline]
578 fn as_ref(&self) -> &[f32] {
579 &self.counts
580 }
581}
582
583impl<A: Alphabet> AsMut<[f32]> for Pseudocounts<A> {
584 #[inline]
585 fn as_mut(&mut self) -> &mut [f32] {
586 &mut self.counts
587 }
588}
589
590#[cfg(test)]
591mod test {
592 use super::*;
593
594 #[test]
595 fn background_new() {
596 assert!(Background::<Dna>::new([0.3, 0.2, 0.2, 0.3, 0.0]).is_ok());
597 assert!(Background::<Dna>::new([0.1, 0.1, 0.1, 0.1, 0.0]).is_err());
598 }
599}