Skip to main content

cyanea_seq/
types.rs

1//! Concrete sequence type aliases and biologically meaningful operations.
2//!
3//! - [`DnaSequence`] — reverse complement, transcription, translation, GC content
4//! - [`RnaSequence`] — reverse complement, reverse transcription, translation
5//! - [`ProteinSequence`] — molecular weight
6
7use cyanea_core::Result;
8
9use crate::alphabet::{DnaAlphabet, ProteinAlphabet, RnaAlphabet};
10use crate::codon::{self, GeneticCode};
11use crate::kmer::KmerIter;
12use crate::seq::ValidatedSeq;
13
14/// A validated DNA sequence (IUPAC alphabet).
15pub type DnaSequence = ValidatedSeq<DnaAlphabet>;
16
17/// A validated RNA sequence (IUPAC alphabet).
18pub type RnaSequence = ValidatedSeq<RnaAlphabet>;
19
20/// A validated protein/amino acid sequence.
21pub type ProteinSequence = ValidatedSeq<ProteinAlphabet>;
22
23// ---------------------------------------------------------------------------
24// DNA complement table (full IUPAC)
25// ---------------------------------------------------------------------------
26
27fn dna_complement(b: u8) -> u8 {
28    match b {
29        b'A' => b'T',
30        b'T' => b'A',
31        b'C' => b'G',
32        b'G' => b'C',
33        b'R' => b'Y', // A|G → T|C
34        b'Y' => b'R',
35        b'S' => b'S', // G|C → C|G
36        b'W' => b'W', // A|T → T|A
37        b'K' => b'M', // G|T → C|A
38        b'M' => b'K',
39        b'B' => b'V', // C|G|T → G|C|A
40        b'V' => b'B',
41        b'D' => b'H', // A|G|T → T|C|A
42        b'H' => b'D',
43        b'N' => b'N',
44        other => other,
45    }
46}
47
48fn rna_complement(b: u8) -> u8 {
49    match b {
50        b'A' => b'U',
51        b'U' => b'A',
52        b'C' => b'G',
53        b'G' => b'C',
54        b'R' => b'Y',
55        b'Y' => b'R',
56        b'S' => b'S',
57        b'W' => b'W',
58        b'K' => b'M',
59        b'M' => b'K',
60        b'B' => b'V',
61        b'V' => b'B',
62        b'D' => b'H',
63        b'H' => b'D',
64        b'N' => b'N',
65        other => other,
66    }
67}
68
69// ---------------------------------------------------------------------------
70// DNA methods
71// ---------------------------------------------------------------------------
72
73impl DnaSequence {
74    /// Return the reverse complement.
75    pub fn reverse_complement(&self) -> DnaSequence {
76        let rc: Vec<u8> = self.iter().rev().map(|&b| dna_complement(b)).collect();
77        DnaSequence::from_validated(rc)
78    }
79
80    /// Transcribe DNA to RNA (T → U).
81    pub fn transcribe(&self) -> RnaSequence {
82        let rna: Vec<u8> = self
83            .iter()
84            .map(|&b| if b == b'T' { b'U' } else { b })
85            .collect();
86        RnaSequence::from_validated(rna)
87    }
88
89    /// Translate DNA to protein (transcribes first, then translates).
90    pub fn translate(&self) -> Result<ProteinSequence> {
91        self.transcribe().translate()
92    }
93
94    /// Translate DNA to protein using a specific genetic code table.
95    pub fn translate_with(&self, code: &GeneticCode) -> Result<ProteinSequence> {
96        let protein_bytes = code.translate_sequence(self.as_ref());
97        ProteinSequence::new(&protein_bytes)
98    }
99
100    /// GC content as a fraction in [0.0, 1.0].
101    ///
102    /// Only counts unambiguous G and C bases. Returns 0.0 for empty sequences.
103    pub fn gc_content(&self) -> f64 {
104        if self.is_empty() {
105            return 0.0;
106        }
107        let gc = self.iter().filter(|&&b| b == b'G' || b == b'C').count();
108        gc as f64 / self.len() as f64
109    }
110
111    /// Iterate over k-mers of length `k`.
112    pub fn kmers(&self, k: usize) -> Result<KmerIter<'_>> {
113        KmerIter::new(self, k)
114    }
115}
116
117// ---------------------------------------------------------------------------
118// RNA methods
119// ---------------------------------------------------------------------------
120
121impl RnaSequence {
122    /// Return the reverse complement.
123    pub fn reverse_complement(&self) -> RnaSequence {
124        let rc: Vec<u8> = self.iter().rev().map(|&b| rna_complement(b)).collect();
125        RnaSequence::from_validated(rc)
126    }
127
128    /// Reverse-transcribe RNA to DNA (U → T).
129    pub fn reverse_transcribe(&self) -> DnaSequence {
130        let dna: Vec<u8> = self
131            .iter()
132            .map(|&b| if b == b'U' { b'T' } else { b })
133            .collect();
134        DnaSequence::from_validated(dna)
135    }
136
137    /// Translate RNA to protein using the standard genetic code.
138    pub fn translate(&self) -> Result<ProteinSequence> {
139        let protein_bytes = codon::translate_sequence(self.as_ref());
140        ProteinSequence::new(&protein_bytes)
141    }
142
143    /// Translate RNA to protein using a specific genetic code table.
144    pub fn translate_with(&self, code: &GeneticCode) -> Result<ProteinSequence> {
145        let protein_bytes = code.translate_sequence(self.as_ref());
146        ProteinSequence::new(&protein_bytes)
147    }
148
149    /// Translate in all three forward reading frames.
150    pub fn translate_frames(&self) -> [Result<ProteinSequence>; 3] {
151        let bytes: &[u8] = self.as_ref();
152        [
153            ProteinSequence::new(&codon::translate_sequence(bytes)),
154            ProteinSequence::new(&codon::translate_sequence(if bytes.len() > 1 {
155                &bytes[1..]
156            } else {
157                &[]
158            })),
159            ProteinSequence::new(&codon::translate_sequence(if bytes.len() > 2 {
160                &bytes[2..]
161            } else {
162                &[]
163            })),
164        ]
165    }
166
167    /// Iterate over k-mers of length `k`.
168    pub fn kmers(&self, k: usize) -> Result<KmerIter<'_>> {
169        KmerIter::new(self, k)
170    }
171}
172
173// ---------------------------------------------------------------------------
174// Protein methods
175// ---------------------------------------------------------------------------
176
177/// Average molecular weights (Da) for each amino acid.
178fn amino_acid_weight(aa: u8) -> f64 {
179    match aa {
180        b'A' => 89.09, b'R' => 174.20, b'N' => 132.12, b'D' => 133.10,
181        b'C' => 121.16, b'E' => 147.13, b'Q' => 146.15, b'G' => 75.03,
182        b'H' => 155.16, b'I' => 131.17, b'L' => 131.17, b'K' => 146.19,
183        b'M' => 149.21, b'F' => 165.19, b'P' => 115.13, b'S' => 105.09,
184        b'T' => 119.12, b'W' => 204.23, b'Y' => 181.19, b'V' => 117.15,
185        // Ambiguous / non-standard — use average of all 20
186        _ => 128.16,
187    }
188}
189
190impl ProteinSequence {
191    /// Estimated molecular weight in Daltons.
192    ///
193    /// Sum of residue weights minus (n-1) water molecules lost in peptide bonds.
194    pub fn molecular_weight(&self) -> f64 {
195        if self.is_empty() {
196            return 0.0;
197        }
198        let sum: f64 = self.iter().map(|&aa| amino_acid_weight(aa)).sum();
199        // Subtract water lost per peptide bond
200        let water = 18.015;
201        sum - (self.len() as f64 - 1.0) * water
202    }
203
204    /// Iterate over k-mers of length `k`.
205    pub fn kmers(&self, k: usize) -> Result<KmerIter<'_>> {
206        KmerIter::new(self, k)
207    }
208}
209
210#[cfg(test)]
211mod tests {
212    use super::*;
213
214    // --- Reverse complement ---
215
216    #[test]
217    fn revcomp_palindromic() {
218        let seq = DnaSequence::new(b"ACGT").unwrap();
219        assert_eq!(seq.reverse_complement().as_ref(), b"ACGT");
220    }
221
222    #[test]
223    fn revcomp_asymmetric() {
224        let seq = DnaSequence::new(b"AACG").unwrap();
225        assert_eq!(seq.reverse_complement().as_ref(), b"CGTT");
226    }
227
228    #[test]
229    fn revcomp_iupac_ambiguity() {
230        let seq = DnaSequence::new(b"RYSWKMBDHVN").unwrap();
231        let rc = seq.reverse_complement();
232        assert_eq!(rc.as_ref(), b"NBDHVKMWSRY");
233    }
234
235    // --- Transcription ---
236
237    #[test]
238    fn dna_to_rna() {
239        let dna = DnaSequence::new(b"ATCG").unwrap();
240        let rna = dna.transcribe();
241        assert_eq!(rna.as_ref(), b"AUCG");
242    }
243
244    #[test]
245    fn rna_to_dna() {
246        let rna = RnaSequence::new(b"AUCG").unwrap();
247        let dna = rna.reverse_transcribe();
248        assert_eq!(dna.as_ref(), b"ATCG");
249    }
250
251    #[test]
252    fn transcription_roundtrip() {
253        let dna = DnaSequence::new(b"ATCGATCG").unwrap();
254        let roundtrip = dna.transcribe().reverse_transcribe();
255        assert_eq!(dna, roundtrip);
256    }
257
258    // --- Translation ---
259
260    #[test]
261    fn translate_aug_uuu_uaa() {
262        // AUG=M, UUU=F, UAA=stop → "MF"
263        let rna = RnaSequence::new(b"AUGUUUUAA").unwrap();
264        let protein = rna.translate().unwrap();
265        assert_eq!(protein.as_ref(), b"MF");
266    }
267
268    #[test]
269    fn translate_stops_at_first_stop() {
270        let rna = RnaSequence::new(b"AUGUUUUAAGCU").unwrap();
271        let protein = rna.translate().unwrap();
272        assert_eq!(protein.as_ref(), b"MF");
273    }
274
275    #[test]
276    fn translate_incomplete_codon_ignored() {
277        let rna = RnaSequence::new(b"AUGUUUAU").unwrap();
278        let protein = rna.translate().unwrap();
279        assert_eq!(protein.as_ref(), b"MF");
280    }
281
282    // --- GC content ---
283
284    #[test]
285    fn gc_content_basic() {
286        let seq = DnaSequence::new(b"ATGC").unwrap();
287        assert!((seq.gc_content() - 0.5).abs() < 1e-10);
288    }
289
290    #[test]
291    fn gc_content_empty() {
292        let seq = DnaSequence::new(b"").unwrap();
293        assert_eq!(seq.gc_content(), 0.0);
294    }
295}
296
297#[cfg(test)]
298mod proptests {
299    use super::*;
300    use proptest::prelude::*;
301
302    /// Strategy: generate a random DNA string of length 1..=500 using only ACGT
303    fn dna_string(max_len: usize) -> impl Strategy<Value = Vec<u8>> {
304        proptest::collection::vec(prop_oneof![Just(b'A'), Just(b'C'), Just(b'G'), Just(b'T')], 1..=max_len)
305    }
306
307    proptest! {
308        #[test]
309        fn transcribe_reverse_transcribe_roundtrips(seq in dna_string(500)) {
310            let dna = DnaSequence::new(&seq).unwrap();
311            let roundtrip = dna.transcribe().reverse_transcribe();
312            prop_assert_eq!(dna.as_ref() as &[u8], roundtrip.as_ref() as &[u8]);
313        }
314
315        #[test]
316        fn gc_content_in_unit_interval(seq in dna_string(500)) {
317            let dna = DnaSequence::new(&seq).unwrap();
318            let gc = dna.gc_content();
319            prop_assert!(gc >= 0.0 && gc <= 1.0, "gc_content={} out of [0,1]", gc);
320        }
321
322        #[test]
323        fn reverse_complement_involution(seq in dna_string(500)) {
324            let dna = DnaSequence::new(&seq).unwrap();
325            let double_rc = dna.reverse_complement().reverse_complement();
326            prop_assert_eq!(dna.as_ref() as &[u8], double_rc.as_ref() as &[u8]);
327        }
328
329        #[test]
330        fn reverse_complement_preserves_length(seq in dna_string(500)) {
331            let dna = DnaSequence::new(&seq).unwrap();
332            prop_assert_eq!(dna.len(), dna.reverse_complement().len());
333        }
334    }
335}