use std::collections::HashMap;
use std::fmt;
use crate::error::{CoreError, CoreResult};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AminoAcid {
Ala,
Arg,
Asn,
Asp,
Cys,
Gln,
Glu,
Gly,
His,
Ile,
Leu,
Lys,
Met,
Phe,
Pro,
Ser,
Thr,
Trp,
Tyr,
Val,
Stop,
}
impl AminoAcid {
#[must_use]
pub fn one_letter(&self) -> char {
match self {
AminoAcid::Ala => 'A',
AminoAcid::Arg => 'R',
AminoAcid::Asn => 'N',
AminoAcid::Asp => 'D',
AminoAcid::Cys => 'C',
AminoAcid::Gln => 'Q',
AminoAcid::Glu => 'E',
AminoAcid::Gly => 'G',
AminoAcid::His => 'H',
AminoAcid::Ile => 'I',
AminoAcid::Leu => 'L',
AminoAcid::Lys => 'K',
AminoAcid::Met => 'M',
AminoAcid::Phe => 'F',
AminoAcid::Pro => 'P',
AminoAcid::Ser => 'S',
AminoAcid::Thr => 'T',
AminoAcid::Trp => 'W',
AminoAcid::Tyr => 'Y',
AminoAcid::Val => 'V',
AminoAcid::Stop => '*',
}
}
#[must_use]
pub fn three_letter(&self) -> &'static str {
match self {
AminoAcid::Ala => "Ala",
AminoAcid::Arg => "Arg",
AminoAcid::Asn => "Asn",
AminoAcid::Asp => "Asp",
AminoAcid::Cys => "Cys",
AminoAcid::Gln => "Gln",
AminoAcid::Glu => "Glu",
AminoAcid::Gly => "Gly",
AminoAcid::His => "His",
AminoAcid::Ile => "Ile",
AminoAcid::Leu => "Leu",
AminoAcid::Lys => "Lys",
AminoAcid::Met => "Met",
AminoAcid::Phe => "Phe",
AminoAcid::Pro => "Pro",
AminoAcid::Ser => "Ser",
AminoAcid::Thr => "Thr",
AminoAcid::Trp => "Trp",
AminoAcid::Tyr => "Tyr",
AminoAcid::Val => "Val",
AminoAcid::Stop => "Stop",
}
}
#[must_use]
pub fn is_stop(&self) -> bool {
matches!(self, AminoAcid::Stop)
}
}
impl fmt::Display for AminoAcid {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.one_letter())
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SequenceType {
Dna,
Rna,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NucleotideSequence {
data: Vec<u8>,
seq_type: SequenceType,
}
impl NucleotideSequence {
pub fn new(seq: &[u8], seq_type: SequenceType) -> CoreResult<Self> {
let data: Vec<u8> = seq.iter().map(|&b| b.to_ascii_uppercase()).collect();
for (i, &base) in data.iter().enumerate() {
if !is_valid_base(base, seq_type) {
return Err(CoreError::ValueError(crate::error_context!(format!(
"Invalid {} base '{}' at position {}",
match seq_type {
SequenceType::Dna => "DNA",
SequenceType::Rna => "RNA",
},
base as char,
i
))));
}
}
Ok(Self { data, seq_type })
}
#[must_use]
pub fn as_bytes(&self) -> &[u8] {
&self.data
}
#[must_use]
pub fn len(&self) -> usize {
self.data.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.data.is_empty()
}
#[must_use]
pub fn seq_type(&self) -> SequenceType {
self.seq_type
}
#[must_use]
pub fn gc_content(&self) -> f64 {
gc_content(&self.data)
}
pub fn reverse_complement(&self) -> CoreResult<NucleotideSequence> {
if self.seq_type != SequenceType::Dna {
return Err(CoreError::ValueError(crate::error_context!(
"reverse_complement is only defined for DNA sequences"
)));
}
let rc = reverse_complement(&self.data);
Ok(NucleotideSequence {
data: rc,
seq_type: SequenceType::Dna,
})
}
pub fn translate(&self) -> CoreResult<Vec<AminoAcid>> {
if self.seq_type != SequenceType::Dna {
return Err(CoreError::ValueError(crate::error_context!(
"translate is only defined for DNA sequences"
)));
}
translate(&self.data)
}
pub fn kmer_count(&self, k: usize) -> CoreResult<HashMap<Vec<u8>, usize>> {
kmer_count(&self.data, k)
}
}
impl fmt::Display for NucleotideSequence {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let s = std::str::from_utf8(&self.data).map_err(|_| fmt::Error)?;
write!(f, "{s}")
}
}
#[must_use]
pub fn is_valid_base(base: u8, seq_type: SequenceType) -> bool {
match seq_type {
SequenceType::Dna => matches!(base, b'A' | b'T' | b'G' | b'C' | b'N'),
SequenceType::Rna => matches!(base, b'A' | b'U' | b'G' | b'C' | b'N'),
}
}
#[must_use]
pub fn complement_base(base: u8) -> u8 {
match base.to_ascii_uppercase() {
b'A' => b'T',
b'T' => b'A',
b'G' => b'C',
b'C' => b'G',
other => other,
}
}
#[must_use]
pub fn complement(seq: &[u8]) -> Vec<u8> {
seq.iter().map(|&b| complement_base(b)).collect()
}
#[must_use]
pub fn reverse_complement(seq: &[u8]) -> Vec<u8> {
seq.iter().rev().map(|&b| complement_base(b)).collect()
}
#[must_use]
pub fn gc_content(seq: &[u8]) -> f64 {
if seq.is_empty() {
return 0.0;
}
let gc = seq
.iter()
.filter(|&&b| {
let ub = b.to_ascii_uppercase();
ub == b'G' || ub == b'C'
})
.count();
gc as f64 / seq.len() as f64
}
pub fn translate(dna: &[u8]) -> CoreResult<Vec<AminoAcid>> {
let codon_table = build_codon_table();
let mut protein = Vec::new();
let upper: Vec<u8> = dna.iter().map(|&b| b.to_ascii_uppercase()).collect();
let mut i = 0;
while i + 3 <= upper.len() {
let codon: [u8; 3] = [upper[i], upper[i + 1], upper[i + 2]];
let aa = codon_table.get(&codon).copied().ok_or_else(|| {
CoreError::ValueError(crate::error_context!(format!(
"Unknown codon: {}{}{}",
codon[0] as char, codon[1] as char, codon[2] as char
)))
})?;
protein.push(aa);
if aa == AminoAcid::Stop {
break;
}
i += 3;
}
Ok(protein)
}
pub fn kmer_count(seq: &[u8], k: usize) -> CoreResult<HashMap<Vec<u8>, usize>> {
if k == 0 {
return Err(CoreError::ValueError(crate::error_context!(
"k must be at least 1"
)));
}
if k > seq.len() {
return Err(CoreError::ValueError(crate::error_context!(format!(
"k ({k}) must not exceed sequence length ({})",
seq.len()
))));
}
let upper: Vec<u8> = seq.iter().map(|&b| b.to_ascii_uppercase()).collect();
let mut counts: HashMap<Vec<u8>, usize> = HashMap::new();
for window in upper.windows(k) {
*counts.entry(window.to_vec()).or_insert(0) += 1;
}
Ok(counts)
}
#[must_use]
pub fn build_codon_table() -> HashMap<[u8; 3], AminoAcid> {
let mut t = HashMap::new();
t.insert(*b"TTT", AminoAcid::Phe);
t.insert(*b"TTC", AminoAcid::Phe);
t.insert(*b"TTA", AminoAcid::Leu);
t.insert(*b"TTG", AminoAcid::Leu);
t.insert(*b"TCT", AminoAcid::Ser);
t.insert(*b"TCC", AminoAcid::Ser);
t.insert(*b"TCA", AminoAcid::Ser);
t.insert(*b"TCG", AminoAcid::Ser);
t.insert(*b"TAT", AminoAcid::Tyr);
t.insert(*b"TAC", AminoAcid::Tyr);
t.insert(*b"TAA", AminoAcid::Stop);
t.insert(*b"TAG", AminoAcid::Stop);
t.insert(*b"TGT", AminoAcid::Cys);
t.insert(*b"TGC", AminoAcid::Cys);
t.insert(*b"TGA", AminoAcid::Stop);
t.insert(*b"TGG", AminoAcid::Trp);
t.insert(*b"CTT", AminoAcid::Leu);
t.insert(*b"CTC", AminoAcid::Leu);
t.insert(*b"CTA", AminoAcid::Leu);
t.insert(*b"CTG", AminoAcid::Leu);
t.insert(*b"CCT", AminoAcid::Pro);
t.insert(*b"CCC", AminoAcid::Pro);
t.insert(*b"CCA", AminoAcid::Pro);
t.insert(*b"CCG", AminoAcid::Pro);
t.insert(*b"CAT", AminoAcid::His);
t.insert(*b"CAC", AminoAcid::His);
t.insert(*b"CAA", AminoAcid::Gln);
t.insert(*b"CAG", AminoAcid::Gln);
t.insert(*b"CGT", AminoAcid::Arg);
t.insert(*b"CGC", AminoAcid::Arg);
t.insert(*b"CGA", AminoAcid::Arg);
t.insert(*b"CGG", AminoAcid::Arg);
t.insert(*b"ATT", AminoAcid::Ile);
t.insert(*b"ATC", AminoAcid::Ile);
t.insert(*b"ATA", AminoAcid::Ile);
t.insert(*b"ATG", AminoAcid::Met);
t.insert(*b"ACT", AminoAcid::Thr);
t.insert(*b"ACC", AminoAcid::Thr);
t.insert(*b"ACA", AminoAcid::Thr);
t.insert(*b"ACG", AminoAcid::Thr);
t.insert(*b"AAT", AminoAcid::Asn);
t.insert(*b"AAC", AminoAcid::Asn);
t.insert(*b"AAA", AminoAcid::Lys);
t.insert(*b"AAG", AminoAcid::Lys);
t.insert(*b"AGT", AminoAcid::Ser);
t.insert(*b"AGC", AminoAcid::Ser);
t.insert(*b"AGA", AminoAcid::Arg);
t.insert(*b"AGG", AminoAcid::Arg);
t.insert(*b"GTT", AminoAcid::Val);
t.insert(*b"GTC", AminoAcid::Val);
t.insert(*b"GTA", AminoAcid::Val);
t.insert(*b"GTG", AminoAcid::Val);
t.insert(*b"GCT", AminoAcid::Ala);
t.insert(*b"GCC", AminoAcid::Ala);
t.insert(*b"GCA", AminoAcid::Ala);
t.insert(*b"GCG", AminoAcid::Ala);
t.insert(*b"GAT", AminoAcid::Asp);
t.insert(*b"GAC", AminoAcid::Asp);
t.insert(*b"GAA", AminoAcid::Glu);
t.insert(*b"GAG", AminoAcid::Glu);
t.insert(*b"GGT", AminoAcid::Gly);
t.insert(*b"GGC", AminoAcid::Gly);
t.insert(*b"GGA", AminoAcid::Gly);
t.insert(*b"GGG", AminoAcid::Gly);
t
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_nucleotide_sequence_valid_dna() {
let seq = NucleotideSequence::new(b"ATGCATGC", SequenceType::Dna);
assert!(seq.is_ok());
let s = seq.expect("sequence creation failed");
assert_eq!(s.len(), 8);
assert_eq!(s.seq_type(), SequenceType::Dna);
}
#[test]
fn test_nucleotide_sequence_invalid_dna() {
let result = NucleotideSequence::new(b"AUGC", SequenceType::Dna);
assert!(result.is_err());
}
#[test]
fn test_nucleotide_sequence_valid_rna() {
let seq = NucleotideSequence::new(b"AUGCAUGC", SequenceType::Rna);
assert!(seq.is_ok());
}
#[test]
fn test_complement_basic() {
assert_eq!(complement(b"ATGC"), b"TACG");
assert_eq!(complement(b"AAAA"), b"TTTT");
assert_eq!(complement(b"CCCC"), b"GGGG");
}
#[test]
fn test_reverse_complement() {
let rc = reverse_complement(b"ATGCTT");
assert_eq!(rc, b"AAGCAT");
}
#[test]
fn test_reverse_complement_palindrome() {
let rc = reverse_complement(b"GAATTC");
assert_eq!(rc, b"GAATTC");
}
#[test]
fn test_gc_content_half() {
let gc = gc_content(b"ATGCATGC");
assert!((gc - 0.5).abs() < 1e-10);
}
#[test]
fn test_gc_content_zero() {
let gc = gc_content(b"AAAA");
assert!((gc - 0.0).abs() < 1e-10);
}
#[test]
fn test_gc_content_one() {
let gc = gc_content(b"GCGC");
assert!((gc - 1.0).abs() < 1e-10);
}
#[test]
fn test_gc_content_empty() {
assert_eq!(gc_content(b""), 0.0);
}
#[test]
fn test_translate_met_stop() {
let protein = translate(b"ATGAAATAA").expect("translation failed");
assert_eq!(
protein,
vec![AminoAcid::Met, AminoAcid::Lys, AminoAcid::Stop]
);
}
#[test]
fn test_translate_stops_at_first_stop() {
let protein = translate(b"ATGTAAAAAA").expect("translation failed");
assert_eq!(protein, vec![AminoAcid::Met, AminoAcid::Stop]);
}
#[test]
fn test_translate_incomplete_codon_ignored() {
let protein = translate(b"ATGAAAT").expect("translation failed");
assert_eq!(protein, vec![AminoAcid::Met, AminoAcid::Lys]);
}
#[test]
fn test_kmer_count_basic() {
let counts = kmer_count(b"ATAT", 2).expect("kmer_count failed");
assert_eq!(*counts.get(b"AT".as_ref()).expect("AT not found"), 2);
assert_eq!(*counts.get(b"TA".as_ref()).expect("TA not found"), 1);
}
#[test]
fn test_kmer_count_k_equals_length() {
let counts = kmer_count(b"ATGC", 4).expect("kmer_count failed");
assert_eq!(counts.len(), 1);
assert_eq!(*counts.get(b"ATGC".as_ref()).expect("ATGC not found"), 1);
}
#[test]
fn test_kmer_count_k_zero_errors() {
let result = kmer_count(b"ATGC", 0);
assert!(result.is_err());
}
#[test]
fn test_kmer_count_k_too_large_errors() {
let result = kmer_count(b"AT", 5);
assert!(result.is_err());
}
#[test]
fn test_amino_acid_one_letter() {
assert_eq!(AminoAcid::Met.one_letter(), 'M');
assert_eq!(AminoAcid::Stop.one_letter(), '*');
}
#[test]
fn test_codon_table_size() {
let table = build_codon_table();
assert_eq!(table.len(), 64);
}
#[test]
fn test_lowercase_input_normalised() {
let seq = NucleotideSequence::new(b"atgc", SequenceType::Dna).expect("should succeed");
assert_eq!(seq.as_bytes(), b"ATGC");
}
}