1use cyanea_core::Result;
8
9use crate::alphabet::{DnaAlphabet, ProteinAlphabet, RnaAlphabet};
10use crate::codon::{self, GeneticCode};
11use crate::kmer::KmerIter;
12use crate::seq::ValidatedSeq;
13
14pub type DnaSequence = ValidatedSeq<DnaAlphabet>;
16
17pub type RnaSequence = ValidatedSeq<RnaAlphabet>;
19
20pub type ProteinSequence = ValidatedSeq<ProteinAlphabet>;
22
23fn dna_complement(b: u8) -> u8 {
28 match b {
29 b'A' => b'T',
30 b'T' => b'A',
31 b'C' => b'G',
32 b'G' => b'C',
33 b'R' => b'Y', b'Y' => b'R',
35 b'S' => b'S', b'W' => b'W', b'K' => b'M', b'M' => b'K',
39 b'B' => b'V', b'V' => b'B',
41 b'D' => b'H', b'H' => b'D',
43 b'N' => b'N',
44 other => other,
45 }
46}
47
48fn rna_complement(b: u8) -> u8 {
49 match b {
50 b'A' => b'U',
51 b'U' => b'A',
52 b'C' => b'G',
53 b'G' => b'C',
54 b'R' => b'Y',
55 b'Y' => b'R',
56 b'S' => b'S',
57 b'W' => b'W',
58 b'K' => b'M',
59 b'M' => b'K',
60 b'B' => b'V',
61 b'V' => b'B',
62 b'D' => b'H',
63 b'H' => b'D',
64 b'N' => b'N',
65 other => other,
66 }
67}
68
69impl DnaSequence {
74 pub fn reverse_complement(&self) -> DnaSequence {
76 let rc: Vec<u8> = self.iter().rev().map(|&b| dna_complement(b)).collect();
77 DnaSequence::from_validated(rc)
78 }
79
80 pub fn transcribe(&self) -> RnaSequence {
82 let rna: Vec<u8> = self
83 .iter()
84 .map(|&b| if b == b'T' { b'U' } else { b })
85 .collect();
86 RnaSequence::from_validated(rna)
87 }
88
89 pub fn translate(&self) -> Result<ProteinSequence> {
91 self.transcribe().translate()
92 }
93
94 pub fn translate_with(&self, code: &GeneticCode) -> Result<ProteinSequence> {
96 let protein_bytes = code.translate_sequence(self.as_ref());
97 ProteinSequence::new(&protein_bytes)
98 }
99
100 pub fn gc_content(&self) -> f64 {
104 if self.is_empty() {
105 return 0.0;
106 }
107 let gc = self.iter().filter(|&&b| b == b'G' || b == b'C').count();
108 gc as f64 / self.len() as f64
109 }
110
111 pub fn kmers(&self, k: usize) -> Result<KmerIter<'_>> {
113 KmerIter::new(self, k)
114 }
115}
116
117impl RnaSequence {
122 pub fn reverse_complement(&self) -> RnaSequence {
124 let rc: Vec<u8> = self.iter().rev().map(|&b| rna_complement(b)).collect();
125 RnaSequence::from_validated(rc)
126 }
127
128 pub fn reverse_transcribe(&self) -> DnaSequence {
130 let dna: Vec<u8> = self
131 .iter()
132 .map(|&b| if b == b'U' { b'T' } else { b })
133 .collect();
134 DnaSequence::from_validated(dna)
135 }
136
137 pub fn translate(&self) -> Result<ProteinSequence> {
139 let protein_bytes = codon::translate_sequence(self.as_ref());
140 ProteinSequence::new(&protein_bytes)
141 }
142
143 pub fn translate_with(&self, code: &GeneticCode) -> Result<ProteinSequence> {
145 let protein_bytes = code.translate_sequence(self.as_ref());
146 ProteinSequence::new(&protein_bytes)
147 }
148
149 pub fn translate_frames(&self) -> [Result<ProteinSequence>; 3] {
151 let bytes: &[u8] = self.as_ref();
152 [
153 ProteinSequence::new(&codon::translate_sequence(bytes)),
154 ProteinSequence::new(&codon::translate_sequence(if bytes.len() > 1 {
155 &bytes[1..]
156 } else {
157 &[]
158 })),
159 ProteinSequence::new(&codon::translate_sequence(if bytes.len() > 2 {
160 &bytes[2..]
161 } else {
162 &[]
163 })),
164 ]
165 }
166
167 pub fn kmers(&self, k: usize) -> Result<KmerIter<'_>> {
169 KmerIter::new(self, k)
170 }
171}
172
173fn amino_acid_weight(aa: u8) -> f64 {
179 match aa {
180 b'A' => 89.09, b'R' => 174.20, b'N' => 132.12, b'D' => 133.10,
181 b'C' => 121.16, b'E' => 147.13, b'Q' => 146.15, b'G' => 75.03,
182 b'H' => 155.16, b'I' => 131.17, b'L' => 131.17, b'K' => 146.19,
183 b'M' => 149.21, b'F' => 165.19, b'P' => 115.13, b'S' => 105.09,
184 b'T' => 119.12, b'W' => 204.23, b'Y' => 181.19, b'V' => 117.15,
185 _ => 128.16,
187 }
188}
189
190impl ProteinSequence {
191 pub fn molecular_weight(&self) -> f64 {
195 if self.is_empty() {
196 return 0.0;
197 }
198 let sum: f64 = self.iter().map(|&aa| amino_acid_weight(aa)).sum();
199 let water = 18.015;
201 sum - (self.len() as f64 - 1.0) * water
202 }
203
204 pub fn kmers(&self, k: usize) -> Result<KmerIter<'_>> {
206 KmerIter::new(self, k)
207 }
208}
209
210#[cfg(test)]
211mod tests {
212 use super::*;
213
214 #[test]
217 fn revcomp_palindromic() {
218 let seq = DnaSequence::new(b"ACGT").unwrap();
219 assert_eq!(seq.reverse_complement().as_ref(), b"ACGT");
220 }
221
222 #[test]
223 fn revcomp_asymmetric() {
224 let seq = DnaSequence::new(b"AACG").unwrap();
225 assert_eq!(seq.reverse_complement().as_ref(), b"CGTT");
226 }
227
228 #[test]
229 fn revcomp_iupac_ambiguity() {
230 let seq = DnaSequence::new(b"RYSWKMBDHVN").unwrap();
231 let rc = seq.reverse_complement();
232 assert_eq!(rc.as_ref(), b"NBDHVKMWSRY");
233 }
234
235 #[test]
238 fn dna_to_rna() {
239 let dna = DnaSequence::new(b"ATCG").unwrap();
240 let rna = dna.transcribe();
241 assert_eq!(rna.as_ref(), b"AUCG");
242 }
243
244 #[test]
245 fn rna_to_dna() {
246 let rna = RnaSequence::new(b"AUCG").unwrap();
247 let dna = rna.reverse_transcribe();
248 assert_eq!(dna.as_ref(), b"ATCG");
249 }
250
251 #[test]
252 fn transcription_roundtrip() {
253 let dna = DnaSequence::new(b"ATCGATCG").unwrap();
254 let roundtrip = dna.transcribe().reverse_transcribe();
255 assert_eq!(dna, roundtrip);
256 }
257
258 #[test]
261 fn translate_aug_uuu_uaa() {
262 let rna = RnaSequence::new(b"AUGUUUUAA").unwrap();
264 let protein = rna.translate().unwrap();
265 assert_eq!(protein.as_ref(), b"MF");
266 }
267
268 #[test]
269 fn translate_stops_at_first_stop() {
270 let rna = RnaSequence::new(b"AUGUUUUAAGCU").unwrap();
271 let protein = rna.translate().unwrap();
272 assert_eq!(protein.as_ref(), b"MF");
273 }
274
275 #[test]
276 fn translate_incomplete_codon_ignored() {
277 let rna = RnaSequence::new(b"AUGUUUAU").unwrap();
278 let protein = rna.translate().unwrap();
279 assert_eq!(protein.as_ref(), b"MF");
280 }
281
282 #[test]
285 fn gc_content_basic() {
286 let seq = DnaSequence::new(b"ATGC").unwrap();
287 assert!((seq.gc_content() - 0.5).abs() < 1e-10);
288 }
289
290 #[test]
291 fn gc_content_empty() {
292 let seq = DnaSequence::new(b"").unwrap();
293 assert_eq!(seq.gc_content(), 0.0);
294 }
295}
296
297#[cfg(test)]
298mod proptests {
299 use super::*;
300 use proptest::prelude::*;
301
302 fn dna_string(max_len: usize) -> impl Strategy<Value = Vec<u8>> {
304 proptest::collection::vec(prop_oneof![Just(b'A'), Just(b'C'), Just(b'G'), Just(b'T')], 1..=max_len)
305 }
306
307 proptest! {
308 #[test]
309 fn transcribe_reverse_transcribe_roundtrips(seq in dna_string(500)) {
310 let dna = DnaSequence::new(&seq).unwrap();
311 let roundtrip = dna.transcribe().reverse_transcribe();
312 prop_assert_eq!(dna.as_ref() as &[u8], roundtrip.as_ref() as &[u8]);
313 }
314
315 #[test]
316 fn gc_content_in_unit_interval(seq in dna_string(500)) {
317 let dna = DnaSequence::new(&seq).unwrap();
318 let gc = dna.gc_content();
319 prop_assert!(gc >= 0.0 && gc <= 1.0, "gc_content={} out of [0,1]", gc);
320 }
321
322 #[test]
323 fn reverse_complement_involution(seq in dna_string(500)) {
324 let dna = DnaSequence::new(&seq).unwrap();
325 let double_rc = dna.reverse_complement().reverse_complement();
326 prop_assert_eq!(dna.as_ref() as &[u8], double_rc.as_ref() as &[u8]);
327 }
328
329 #[test]
330 fn reverse_complement_preserves_length(seq in dna_string(500)) {
331 let dna = DnaSequence::new(&seq).unwrap();
332 prop_assert_eq!(dna.len(), dna.reverse_complement().len());
333 }
334 }
335}