1use std::collections::HashMap;
8use std::fmt;
9
10use crate::error::{CoreError, CoreResult};
11
12#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
14pub enum AminoAcid {
15 Ala,
17 Arg,
19 Asn,
21 Asp,
23 Cys,
25 Gln,
27 Glu,
29 Gly,
31 His,
33 Ile,
35 Leu,
37 Lys,
39 Met,
41 Phe,
43 Pro,
45 Ser,
47 Thr,
49 Trp,
51 Tyr,
53 Val,
55 Stop,
57}
58
59impl AminoAcid {
60 #[must_use]
64 pub fn one_letter(&self) -> char {
65 match self {
66 AminoAcid::Ala => 'A',
67 AminoAcid::Arg => 'R',
68 AminoAcid::Asn => 'N',
69 AminoAcid::Asp => 'D',
70 AminoAcid::Cys => 'C',
71 AminoAcid::Gln => 'Q',
72 AminoAcid::Glu => 'E',
73 AminoAcid::Gly => 'G',
74 AminoAcid::His => 'H',
75 AminoAcid::Ile => 'I',
76 AminoAcid::Leu => 'L',
77 AminoAcid::Lys => 'K',
78 AminoAcid::Met => 'M',
79 AminoAcid::Phe => 'F',
80 AminoAcid::Pro => 'P',
81 AminoAcid::Ser => 'S',
82 AminoAcid::Thr => 'T',
83 AminoAcid::Trp => 'W',
84 AminoAcid::Tyr => 'Y',
85 AminoAcid::Val => 'V',
86 AminoAcid::Stop => '*',
87 }
88 }
89
90 #[must_use]
92 pub fn three_letter(&self) -> &'static str {
93 match self {
94 AminoAcid::Ala => "Ala",
95 AminoAcid::Arg => "Arg",
96 AminoAcid::Asn => "Asn",
97 AminoAcid::Asp => "Asp",
98 AminoAcid::Cys => "Cys",
99 AminoAcid::Gln => "Gln",
100 AminoAcid::Glu => "Glu",
101 AminoAcid::Gly => "Gly",
102 AminoAcid::His => "His",
103 AminoAcid::Ile => "Ile",
104 AminoAcid::Leu => "Leu",
105 AminoAcid::Lys => "Lys",
106 AminoAcid::Met => "Met",
107 AminoAcid::Phe => "Phe",
108 AminoAcid::Pro => "Pro",
109 AminoAcid::Ser => "Ser",
110 AminoAcid::Thr => "Thr",
111 AminoAcid::Trp => "Trp",
112 AminoAcid::Tyr => "Tyr",
113 AminoAcid::Val => "Val",
114 AminoAcid::Stop => "Stop",
115 }
116 }
117
118 #[must_use]
120 pub fn is_stop(&self) -> bool {
121 matches!(self, AminoAcid::Stop)
122 }
123}
124
125impl fmt::Display for AminoAcid {
126 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
127 write!(f, "{}", self.one_letter())
128 }
129}
130
131#[derive(Debug, Clone, Copy, PartialEq, Eq)]
133pub enum SequenceType {
134 Dna,
136 Rna,
138}
139
140#[derive(Debug, Clone, PartialEq, Eq)]
156pub struct NucleotideSequence {
157 data: Vec<u8>,
158 seq_type: SequenceType,
159}
160
161impl NucleotideSequence {
162 pub fn new(seq: &[u8], seq_type: SequenceType) -> CoreResult<Self> {
171 let data: Vec<u8> = seq.iter().map(|&b| b.to_ascii_uppercase()).collect();
172 for (i, &base) in data.iter().enumerate() {
173 if !is_valid_base(base, seq_type) {
174 return Err(CoreError::ValueError(crate::error_context!(format!(
175 "Invalid {} base '{}' at position {}",
176 match seq_type {
177 SequenceType::Dna => "DNA",
178 SequenceType::Rna => "RNA",
179 },
180 base as char,
181 i
182 ))));
183 }
184 }
185 Ok(Self { data, seq_type })
186 }
187
188 #[must_use]
190 pub fn as_bytes(&self) -> &[u8] {
191 &self.data
192 }
193
194 #[must_use]
196 pub fn len(&self) -> usize {
197 self.data.len()
198 }
199
200 #[must_use]
202 pub fn is_empty(&self) -> bool {
203 self.data.is_empty()
204 }
205
206 #[must_use]
208 pub fn seq_type(&self) -> SequenceType {
209 self.seq_type
210 }
211
212 #[must_use]
216 pub fn gc_content(&self) -> f64 {
217 gc_content(&self.data)
218 }
219
220 pub fn reverse_complement(&self) -> CoreResult<NucleotideSequence> {
226 if self.seq_type != SequenceType::Dna {
227 return Err(CoreError::ValueError(crate::error_context!(
228 "reverse_complement is only defined for DNA sequences"
229 )));
230 }
231 let rc = reverse_complement(&self.data);
232 Ok(NucleotideSequence {
233 data: rc,
234 seq_type: SequenceType::Dna,
235 })
236 }
237
238 pub fn translate(&self) -> CoreResult<Vec<AminoAcid>> {
247 if self.seq_type != SequenceType::Dna {
248 return Err(CoreError::ValueError(crate::error_context!(
249 "translate is only defined for DNA sequences"
250 )));
251 }
252 translate(&self.data)
253 }
254
255 pub fn kmer_count(&self, k: usize) -> CoreResult<HashMap<Vec<u8>, usize>> {
261 kmer_count(&self.data, k)
262 }
263}
264
265impl fmt::Display for NucleotideSequence {
266 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
267 let s = std::str::from_utf8(&self.data).map_err(|_| fmt::Error)?;
269 write!(f, "{s}")
270 }
271}
272
273#[must_use]
279pub fn is_valid_base(base: u8, seq_type: SequenceType) -> bool {
280 match seq_type {
281 SequenceType::Dna => matches!(base, b'A' | b'T' | b'G' | b'C' | b'N'),
282 SequenceType::Rna => matches!(base, b'A' | b'U' | b'G' | b'C' | b'N'),
283 }
284}
285
286#[must_use]
290pub fn complement_base(base: u8) -> u8 {
291 match base.to_ascii_uppercase() {
292 b'A' => b'T',
293 b'T' => b'A',
294 b'G' => b'C',
295 b'C' => b'G',
296 other => other,
297 }
298}
299
300#[must_use]
314pub fn complement(seq: &[u8]) -> Vec<u8> {
315 seq.iter().map(|&b| complement_base(b)).collect()
316}
317
318#[must_use]
332pub fn reverse_complement(seq: &[u8]) -> Vec<u8> {
333 seq.iter().rev().map(|&b| complement_base(b)).collect()
334}
335
336#[must_use]
350pub fn gc_content(seq: &[u8]) -> f64 {
351 if seq.is_empty() {
352 return 0.0;
353 }
354 let gc = seq
355 .iter()
356 .filter(|&&b| {
357 let ub = b.to_ascii_uppercase();
358 ub == b'G' || ub == b'C'
359 })
360 .count();
361 gc as f64 / seq.len() as f64
362}
363
364pub fn translate(dna: &[u8]) -> CoreResult<Vec<AminoAcid>> {
385 let codon_table = build_codon_table();
386 let mut protein = Vec::new();
387
388 let upper: Vec<u8> = dna.iter().map(|&b| b.to_ascii_uppercase()).collect();
389
390 let mut i = 0;
391 while i + 3 <= upper.len() {
392 let codon: [u8; 3] = [upper[i], upper[i + 1], upper[i + 2]];
393 let aa = codon_table.get(&codon).copied().ok_or_else(|| {
394 CoreError::ValueError(crate::error_context!(format!(
395 "Unknown codon: {}{}{}",
396 codon[0] as char, codon[1] as char, codon[2] as char
397 )))
398 })?;
399 protein.push(aa);
400 if aa == AminoAcid::Stop {
401 break;
402 }
403 i += 3;
404 }
405 Ok(protein)
406}
407
408pub fn kmer_count(seq: &[u8], k: usize) -> CoreResult<HashMap<Vec<u8>, usize>> {
424 if k == 0 {
425 return Err(CoreError::ValueError(crate::error_context!(
426 "k must be at least 1"
427 )));
428 }
429 if k > seq.len() {
430 return Err(CoreError::ValueError(crate::error_context!(format!(
431 "k ({k}) must not exceed sequence length ({})",
432 seq.len()
433 ))));
434 }
435
436 let upper: Vec<u8> = seq.iter().map(|&b| b.to_ascii_uppercase()).collect();
437 let mut counts: HashMap<Vec<u8>, usize> = HashMap::new();
438
439 for window in upper.windows(k) {
440 *counts.entry(window.to_vec()).or_insert(0) += 1;
441 }
442 Ok(counts)
443}
444
445#[must_use]
450pub fn build_codon_table() -> HashMap<[u8; 3], AminoAcid> {
451 let mut t = HashMap::new();
452
453 t.insert(*b"TTT", AminoAcid::Phe);
455 t.insert(*b"TTC", AminoAcid::Phe);
456 t.insert(*b"TTA", AminoAcid::Leu);
457 t.insert(*b"TTG", AminoAcid::Leu);
458
459 t.insert(*b"TCT", AminoAcid::Ser);
461 t.insert(*b"TCC", AminoAcid::Ser);
462 t.insert(*b"TCA", AminoAcid::Ser);
463 t.insert(*b"TCG", AminoAcid::Ser);
464
465 t.insert(*b"TAT", AminoAcid::Tyr);
467 t.insert(*b"TAC", AminoAcid::Tyr);
468 t.insert(*b"TAA", AminoAcid::Stop);
469 t.insert(*b"TAG", AminoAcid::Stop);
470
471 t.insert(*b"TGT", AminoAcid::Cys);
473 t.insert(*b"TGC", AminoAcid::Cys);
474 t.insert(*b"TGA", AminoAcid::Stop);
475 t.insert(*b"TGG", AminoAcid::Trp);
476
477 t.insert(*b"CTT", AminoAcid::Leu);
479 t.insert(*b"CTC", AminoAcid::Leu);
480 t.insert(*b"CTA", AminoAcid::Leu);
481 t.insert(*b"CTG", AminoAcid::Leu);
482
483 t.insert(*b"CCT", AminoAcid::Pro);
485 t.insert(*b"CCC", AminoAcid::Pro);
486 t.insert(*b"CCA", AminoAcid::Pro);
487 t.insert(*b"CCG", AminoAcid::Pro);
488
489 t.insert(*b"CAT", AminoAcid::His);
491 t.insert(*b"CAC", AminoAcid::His);
492 t.insert(*b"CAA", AminoAcid::Gln);
493 t.insert(*b"CAG", AminoAcid::Gln);
494
495 t.insert(*b"CGT", AminoAcid::Arg);
497 t.insert(*b"CGC", AminoAcid::Arg);
498 t.insert(*b"CGA", AminoAcid::Arg);
499 t.insert(*b"CGG", AminoAcid::Arg);
500
501 t.insert(*b"ATT", AminoAcid::Ile);
503 t.insert(*b"ATC", AminoAcid::Ile);
504 t.insert(*b"ATA", AminoAcid::Ile);
505 t.insert(*b"ATG", AminoAcid::Met);
506
507 t.insert(*b"ACT", AminoAcid::Thr);
509 t.insert(*b"ACC", AminoAcid::Thr);
510 t.insert(*b"ACA", AminoAcid::Thr);
511 t.insert(*b"ACG", AminoAcid::Thr);
512
513 t.insert(*b"AAT", AminoAcid::Asn);
515 t.insert(*b"AAC", AminoAcid::Asn);
516 t.insert(*b"AAA", AminoAcid::Lys);
517 t.insert(*b"AAG", AminoAcid::Lys);
518
519 t.insert(*b"AGT", AminoAcid::Ser);
521 t.insert(*b"AGC", AminoAcid::Ser);
522 t.insert(*b"AGA", AminoAcid::Arg);
523 t.insert(*b"AGG", AminoAcid::Arg);
524
525 t.insert(*b"GTT", AminoAcid::Val);
527 t.insert(*b"GTC", AminoAcid::Val);
528 t.insert(*b"GTA", AminoAcid::Val);
529 t.insert(*b"GTG", AminoAcid::Val);
530
531 t.insert(*b"GCT", AminoAcid::Ala);
533 t.insert(*b"GCC", AminoAcid::Ala);
534 t.insert(*b"GCA", AminoAcid::Ala);
535 t.insert(*b"GCG", AminoAcid::Ala);
536
537 t.insert(*b"GAT", AminoAcid::Asp);
539 t.insert(*b"GAC", AminoAcid::Asp);
540 t.insert(*b"GAA", AminoAcid::Glu);
541 t.insert(*b"GAG", AminoAcid::Glu);
542
543 t.insert(*b"GGT", AminoAcid::Gly);
545 t.insert(*b"GGC", AminoAcid::Gly);
546 t.insert(*b"GGA", AminoAcid::Gly);
547 t.insert(*b"GGG", AminoAcid::Gly);
548
549 t
550}
551
552#[cfg(test)]
553mod tests {
554 use super::*;
555
556 #[test]
557 fn test_nucleotide_sequence_valid_dna() {
558 let seq = NucleotideSequence::new(b"ATGCATGC", SequenceType::Dna);
559 assert!(seq.is_ok());
560 let s = seq.expect("sequence creation failed");
561 assert_eq!(s.len(), 8);
562 assert_eq!(s.seq_type(), SequenceType::Dna);
563 }
564
565 #[test]
566 fn test_nucleotide_sequence_invalid_dna() {
567 let result = NucleotideSequence::new(b"AUGC", SequenceType::Dna);
569 assert!(result.is_err());
570 }
571
572 #[test]
573 fn test_nucleotide_sequence_valid_rna() {
574 let seq = NucleotideSequence::new(b"AUGCAUGC", SequenceType::Rna);
575 assert!(seq.is_ok());
576 }
577
578 #[test]
579 fn test_complement_basic() {
580 assert_eq!(complement(b"ATGC"), b"TACG");
581 assert_eq!(complement(b"AAAA"), b"TTTT");
582 assert_eq!(complement(b"CCCC"), b"GGGG");
583 }
584
585 #[test]
586 fn test_reverse_complement() {
587 let rc = reverse_complement(b"ATGCTT");
589 assert_eq!(rc, b"AAGCAT");
590 }
591
592 #[test]
593 fn test_reverse_complement_palindrome() {
594 let rc = reverse_complement(b"GAATTC");
596 assert_eq!(rc, b"GAATTC");
597 }
598
599 #[test]
600 fn test_gc_content_half() {
601 let gc = gc_content(b"ATGCATGC");
602 assert!((gc - 0.5).abs() < 1e-10);
603 }
604
605 #[test]
606 fn test_gc_content_zero() {
607 let gc = gc_content(b"AAAA");
608 assert!((gc - 0.0).abs() < 1e-10);
609 }
610
611 #[test]
612 fn test_gc_content_one() {
613 let gc = gc_content(b"GCGC");
614 assert!((gc - 1.0).abs() < 1e-10);
615 }
616
617 #[test]
618 fn test_gc_content_empty() {
619 assert_eq!(gc_content(b""), 0.0);
620 }
621
622 #[test]
623 fn test_translate_met_stop() {
624 let protein = translate(b"ATGAAATAA").expect("translation failed");
625 assert_eq!(
626 protein,
627 vec![AminoAcid::Met, AminoAcid::Lys, AminoAcid::Stop]
628 );
629 }
630
631 #[test]
632 fn test_translate_stops_at_first_stop() {
633 let protein = translate(b"ATGTAAAAAA").expect("translation failed");
635 assert_eq!(protein, vec![AminoAcid::Met, AminoAcid::Stop]);
636 }
637
638 #[test]
639 fn test_translate_incomplete_codon_ignored() {
640 let protein = translate(b"ATGAAAT").expect("translation failed");
642 assert_eq!(protein, vec![AminoAcid::Met, AminoAcid::Lys]);
643 }
644
645 #[test]
646 fn test_kmer_count_basic() {
647 let counts = kmer_count(b"ATAT", 2).expect("kmer_count failed");
648 assert_eq!(*counts.get(b"AT".as_ref()).expect("AT not found"), 2);
649 assert_eq!(*counts.get(b"TA".as_ref()).expect("TA not found"), 1);
650 }
651
652 #[test]
653 fn test_kmer_count_k_equals_length() {
654 let counts = kmer_count(b"ATGC", 4).expect("kmer_count failed");
655 assert_eq!(counts.len(), 1);
656 assert_eq!(*counts.get(b"ATGC".as_ref()).expect("ATGC not found"), 1);
657 }
658
659 #[test]
660 fn test_kmer_count_k_zero_errors() {
661 let result = kmer_count(b"ATGC", 0);
662 assert!(result.is_err());
663 }
664
665 #[test]
666 fn test_kmer_count_k_too_large_errors() {
667 let result = kmer_count(b"AT", 5);
668 assert!(result.is_err());
669 }
670
671 #[test]
672 fn test_amino_acid_one_letter() {
673 assert_eq!(AminoAcid::Met.one_letter(), 'M');
674 assert_eq!(AminoAcid::Stop.one_letter(), '*');
675 }
676
677 #[test]
678 fn test_codon_table_size() {
679 let table = build_codon_table();
680 assert_eq!(table.len(), 64);
682 }
683
684 #[test]
685 fn test_lowercase_input_normalised() {
686 let seq = NucleotideSequence::new(b"atgc", SequenceType::Dna).expect("should succeed");
688 assert_eq!(seq.as_bytes(), b"ATGC");
689 }
690}