Skip to main content

cyanea_seq/
codon.rs

1//! Codon translation, genetic codes, codon usage analysis, and CAI.
2//!
3//! Supports 7 NCBI genetic code tables (1, 2, 3, 4, 5, 6, 11) with
4//! full translation, start/stop codon queries, codon usage frequency
5//! tables, RSCU, CAI, and synonymous/non-synonymous site counting.
6
7use cyanea_core::{CyaneaError, Result};
8
9// ---------------------------------------------------------------------------
10// Base encoding: A=0, C=1, G=2, T/U=3
11// ---------------------------------------------------------------------------
12
13fn base_index(b: u8) -> Option<usize> {
14    match b.to_ascii_uppercase() {
15        b'A' => Some(0),
16        b'C' => Some(1),
17        b'G' => Some(2),
18        b'T' | b'U' => Some(3),
19        _ => None,
20    }
21}
22
23/// Convert a 3-base codon to an index in [0, 64).
24fn codon_index(codon: &[u8]) -> Option<usize> {
25    if codon.len() != 3 {
26        return None;
27    }
28    let b1 = base_index(codon[0])?;
29    let b2 = base_index(codon[1])?;
30    let b3 = base_index(codon[2])?;
31    Some(b1 * 16 + b2 * 4 + b3)
32}
33
34/// Convert an index in [0, 64) back to a codon (as DNA: A/C/G/T).
35fn index_to_codon(idx: usize) -> [u8; 3] {
36    const BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
37    [
38        BASES[idx >> 4],
39        BASES[(idx >> 2) & 3],
40        BASES[idx & 3],
41    ]
42}
43
44// ---------------------------------------------------------------------------
45// Genetic code table identifier
46// ---------------------------------------------------------------------------
47
48/// NCBI genetic code table identifier.
49///
50/// Covers the 7 most commonly used tables.
51#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
52#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
53pub enum GeneticCodeId {
54    Standard = 1,
55    VertebrateMitochondrial = 2,
56    YeastMitochondrial = 3,
57    MycoplasmaSpiroplasma = 4,
58    InvertebrateMitochondrial = 5,
59    CiliateNuclear = 6,
60    BacterialPlastid = 11,
61}
62
63// ---------------------------------------------------------------------------
64// Genetic code tables (const arrays)
65// ---------------------------------------------------------------------------
66
67// Codon order: AAA, AAC, AAG, AAT, ACA, ACC, ACG, ACT, AGA, AGC, AGG, AGT,
68//              ATA, ATC, ATG, ATT, CAA, CAC, CAG, CAT, CCA, CCC, CCG, CCT,
69//              CGA, CGC, CGG, CGT, CTA, CTC, CTG, CTT, GAA, GAC, GAG, GAT,
70//              GCA, GCC, GCG, GCT, GGA, GGC, GGG, GGT, GTA, GTC, GTG, GTT,
71//              TAA, TAC, TAG, TAT, TCA, TCC, TCG, TCT, TGA, TGC, TGG, TGT,
72//              TTA, TTC, TTG, TTT
73
74/// Standard genetic code (NCBI Table 1).
75const TABLE1_AA: [u8; 64] = [
76    b'K', b'N', b'K', b'N', b'T', b'T', b'T', b'T', b'R', b'S', b'R', b'S',
77    b'I', b'I', b'M', b'I', b'Q', b'H', b'Q', b'H', b'P', b'P', b'P', b'P',
78    b'R', b'R', b'R', b'R', b'L', b'L', b'L', b'L', b'E', b'D', b'E', b'D',
79    b'A', b'A', b'A', b'A', b'G', b'G', b'G', b'G', b'V', b'V', b'V', b'V',
80    b'*', b'Y', b'*', b'Y', b'S', b'S', b'S', b'S', b'*', b'C', b'W', b'C',
81    b'L', b'F', b'L', b'F',
82];
83
84const TABLE1_STARTS: [bool; 64] = {
85    let mut s = [false; 64];
86    // ATG
87    s[14] = true;
88    s
89};
90
91/// Vertebrate mitochondrial (NCBI Table 2).
92/// UGA=Trp, AGA=Stop, AGG=Stop, AUA=Met
93const TABLE2_AA: [u8; 64] = [
94    b'K', b'N', b'K', b'N', b'T', b'T', b'T', b'T', b'*', b'S', b'*', b'S',
95    b'M', b'I', b'M', b'I', b'Q', b'H', b'Q', b'H', b'P', b'P', b'P', b'P',
96    b'R', b'R', b'R', b'R', b'L', b'L', b'L', b'L', b'E', b'D', b'E', b'D',
97    b'A', b'A', b'A', b'A', b'G', b'G', b'G', b'G', b'V', b'V', b'V', b'V',
98    b'*', b'Y', b'*', b'Y', b'S', b'S', b'S', b'S', b'W', b'C', b'W', b'C',
99    b'L', b'F', b'L', b'F',
100];
101
102const TABLE2_STARTS: [bool; 64] = {
103    let mut s = [false; 64];
104    s[14] = true; // ATG
105    s[12] = true; // ATA
106    s[13] = true; // ATC
107    s[15] = true; // ATT
108    s[46] = true; // GTG
109    s
110};
111
112/// Yeast mitochondrial (NCBI Table 3).
113/// CUG=Thr (not Leu), UGA=Trp, CUA=Thr (not Leu)
114const TABLE3_AA: [u8; 64] = [
115    b'K', b'N', b'K', b'N', b'T', b'T', b'T', b'T', b'R', b'S', b'R', b'S',
116    b'M', b'I', b'M', b'I', b'Q', b'H', b'Q', b'H', b'P', b'P', b'P', b'P',
117    b'R', b'R', b'R', b'R', b'T', b'L', b'T', b'L', b'E', b'D', b'E', b'D',
118    b'A', b'A', b'A', b'A', b'G', b'G', b'G', b'G', b'V', b'V', b'V', b'V',
119    b'*', b'Y', b'*', b'Y', b'S', b'S', b'S', b'S', b'W', b'C', b'W', b'C',
120    b'L', b'F', b'L', b'F',
121];
122
123const TABLE3_STARTS: [bool; 64] = {
124    let mut s = [false; 64];
125    s[14] = true; // ATG
126    s[12] = true; // ATA
127    s
128};
129
130/// Mycoplasma/Spiroplasma (NCBI Table 4).
131/// UGA=Trp
132const TABLE4_AA: [u8; 64] = [
133    b'K', b'N', b'K', b'N', b'T', b'T', b'T', b'T', b'R', b'S', b'R', b'S',
134    b'I', b'I', b'M', b'I', b'Q', b'H', b'Q', b'H', b'P', b'P', b'P', b'P',
135    b'R', b'R', b'R', b'R', b'L', b'L', b'L', b'L', b'E', b'D', b'E', b'D',
136    b'A', b'A', b'A', b'A', b'G', b'G', b'G', b'G', b'V', b'V', b'V', b'V',
137    b'*', b'Y', b'*', b'Y', b'S', b'S', b'S', b'S', b'W', b'C', b'W', b'C',
138    b'L', b'F', b'L', b'F',
139];
140
141const TABLE4_STARTS: [bool; 64] = {
142    let mut s = [false; 64];
143    s[14] = true; // ATG
144    s[62] = true; // TTG
145    s[46] = true; // GTG
146    s
147};
148
149/// Invertebrate mitochondrial (NCBI Table 5).
150/// AGA=Ser, AGG=Ser, UGA=Trp, AUA=Met
151const TABLE5_AA: [u8; 64] = [
152    b'K', b'N', b'K', b'N', b'T', b'T', b'T', b'T', b'S', b'S', b'S', b'S',
153    b'M', b'I', b'M', b'I', b'Q', b'H', b'Q', b'H', b'P', b'P', b'P', b'P',
154    b'R', b'R', b'R', b'R', b'L', b'L', b'L', b'L', b'E', b'D', b'E', b'D',
155    b'A', b'A', b'A', b'A', b'G', b'G', b'G', b'G', b'V', b'V', b'V', b'V',
156    b'*', b'Y', b'*', b'Y', b'S', b'S', b'S', b'S', b'W', b'C', b'W', b'C',
157    b'L', b'F', b'L', b'F',
158];
159
160const TABLE5_STARTS: [bool; 64] = {
161    let mut s = [false; 64];
162    s[14] = true; // ATG
163    s[15] = true; // ATT
164    s[46] = true; // GTG
165    s
166};
167
168/// Ciliate nuclear (NCBI Table 6).
169/// UAA=Gln, UAG=Gln (not Stop)
170const TABLE6_AA: [u8; 64] = [
171    b'K', b'N', b'K', b'N', b'T', b'T', b'T', b'T', b'R', b'S', b'R', b'S',
172    b'I', b'I', b'M', b'I', b'Q', b'H', b'Q', b'H', b'P', b'P', b'P', b'P',
173    b'R', b'R', b'R', b'R', b'L', b'L', b'L', b'L', b'E', b'D', b'E', b'D',
174    b'A', b'A', b'A', b'A', b'G', b'G', b'G', b'G', b'V', b'V', b'V', b'V',
175    b'Q', b'Y', b'Q', b'Y', b'S', b'S', b'S', b'S', b'*', b'C', b'W', b'C',
176    b'L', b'F', b'L', b'F',
177];
178
179const TABLE6_STARTS: [bool; 64] = {
180    let mut s = [false; 64];
181    s[14] = true; // ATG
182    s
183};
184
185/// Bacterial/plant plastid (NCBI Table 11).
186/// Same AAs as standard, different starts.
187const TABLE11_AA: [u8; 64] = TABLE1_AA;
188
189const TABLE11_STARTS: [bool; 64] = {
190    let mut s = [false; 64];
191    s[14] = true; // ATG
192    s[62] = true; // TTG
193    s[46] = true; // GTG
194    s[13] = true; // ATC
195    s[15] = true; // ATT
196    s[28] = true; // CTA (CTG in some refs — use the common bacterial ones)
197    s
198};
199
200// ---------------------------------------------------------------------------
201// GeneticCode
202// ---------------------------------------------------------------------------
203
204/// A genetic code translation table.
205///
206/// Wraps a 64-element amino acid lookup array and a 64-element start codon mask.
207/// Use [`GeneticCode::from_id`] or [`GeneticCode::standard`] to create.
208#[derive(Debug, Clone)]
209pub struct GeneticCode {
210    id: GeneticCodeId,
211    name: &'static str,
212    table: [u8; 64],
213    starts: [bool; 64],
214}
215
216impl GeneticCode {
217    /// Create a genetic code table from an NCBI table identifier.
218    pub fn from_id(id: GeneticCodeId) -> Self {
219        match id {
220            GeneticCodeId::Standard => Self {
221                id,
222                name: "Standard",
223                table: TABLE1_AA,
224                starts: TABLE1_STARTS,
225            },
226            GeneticCodeId::VertebrateMitochondrial => Self {
227                id,
228                name: "Vertebrate Mitochondrial",
229                table: TABLE2_AA,
230                starts: TABLE2_STARTS,
231            },
232            GeneticCodeId::YeastMitochondrial => Self {
233                id,
234                name: "Yeast Mitochondrial",
235                table: TABLE3_AA,
236                starts: TABLE3_STARTS,
237            },
238            GeneticCodeId::MycoplasmaSpiroplasma => Self {
239                id,
240                name: "Mycoplasma/Spiroplasma",
241                table: TABLE4_AA,
242                starts: TABLE4_STARTS,
243            },
244            GeneticCodeId::InvertebrateMitochondrial => Self {
245                id,
246                name: "Invertebrate Mitochondrial",
247                table: TABLE5_AA,
248                starts: TABLE5_STARTS,
249            },
250            GeneticCodeId::CiliateNuclear => Self {
251                id,
252                name: "Ciliate Nuclear",
253                table: TABLE6_AA,
254                starts: TABLE6_STARTS,
255            },
256            GeneticCodeId::BacterialPlastid => Self {
257                id,
258                name: "Bacterial/Plant Plastid",
259                table: TABLE11_AA,
260                starts: TABLE11_STARTS,
261            },
262        }
263    }
264
265    /// Create the standard genetic code (NCBI Table 1).
266    pub fn standard() -> Self {
267        Self::from_id(GeneticCodeId::Standard)
268    }
269
270    /// Table identifier.
271    pub fn id(&self) -> GeneticCodeId {
272        self.id
273    }
274
275    /// Human-readable name.
276    pub fn name(&self) -> &str {
277        self.name
278    }
279
280    /// Translate a single codon (3-byte slice) to an amino acid.
281    ///
282    /// Returns `None` for stop codons (encoded as `b'*'` internally) and
283    /// for unrecognizable/ambiguous codons.
284    pub fn translate_codon(&self, codon: &[u8]) -> Option<u8> {
285        let idx = codon_index(codon)?;
286        let aa = self.table[idx];
287        if aa == b'*' {
288            None
289        } else {
290            Some(aa)
291        }
292    }
293
294    /// Translate a nucleotide sequence, stopping at the first stop codon.
295    ///
296    /// Incomplete trailing codons are ignored.
297    pub fn translate_sequence(&self, seq: &[u8]) -> Vec<u8> {
298        let mut protein = Vec::with_capacity(seq.len() / 3);
299        for codon in seq.chunks_exact(3) {
300            match self.translate_codon(codon) {
301                Some(aa) => protein.push(aa),
302                None => break,
303            }
304        }
305        protein
306    }
307
308    /// Translate a nucleotide sequence, including stop codons as `'*'`.
309    ///
310    /// Does not stop at stop codons — translates the entire sequence.
311    /// Incomplete trailing codons are ignored.
312    pub fn translate_sequence_full(&self, seq: &[u8]) -> Vec<u8> {
313        let mut protein = Vec::with_capacity(seq.len() / 3);
314        for codon in seq.chunks_exact(3) {
315            if let Some(idx) = codon_index(codon) {
316                protein.push(self.table[idx]);
317            }
318        }
319        protein
320    }
321
322    /// Check whether a codon is a start codon in this table.
323    pub fn is_start(&self, codon: &[u8]) -> bool {
324        codon_index(codon).map_or(false, |idx| self.starts[idx])
325    }
326
327    /// Check whether a codon is a stop codon in this table.
328    pub fn is_stop(&self, codon: &[u8]) -> bool {
329        codon_index(codon).map_or(false, |idx| self.table[idx] == b'*')
330    }
331
332    /// Return all stop codons for this table (as DNA).
333    pub fn stop_codons(&self) -> Vec<[u8; 3]> {
334        (0..64)
335            .filter(|&i| self.table[i] == b'*')
336            .map(index_to_codon)
337            .collect()
338    }
339
340    /// Return all start codons for this table (as DNA).
341    pub fn start_codons(&self) -> Vec<[u8; 3]> {
342        (0..64)
343            .filter(|&i| self.starts[i])
344            .map(index_to_codon)
345            .collect()
346    }
347
348    /// Get the amino acid for a codon index (0..64), including `b'*'` for stops.
349    fn aa_at(&self, idx: usize) -> u8 {
350        self.table[idx]
351    }
352}
353
354// ---------------------------------------------------------------------------
355// Backward-compatible free functions (delegate to standard code)
356// ---------------------------------------------------------------------------
357
358/// Translate a single codon using the standard genetic code (NCBI Table 1).
359///
360/// Accepts both DNA (T) and RNA (U) codons. Returns `None` for stop codons
361/// and unrecognized/ambiguous codons.
362pub fn translate_codon(codon: &[u8]) -> Option<u8> {
363    GeneticCode::standard().translate_codon(codon)
364}
365
366/// Translate a nucleotide sequence using the standard genetic code.
367///
368/// Stops at the first stop codon. Incomplete trailing codons are ignored.
369pub fn translate_sequence(seq: &[u8]) -> Vec<u8> {
370    GeneticCode::standard().translate_sequence(seq)
371}
372
373// ---------------------------------------------------------------------------
374// Codon usage
375// ---------------------------------------------------------------------------
376
377/// Codon usage frequency table.
378///
379/// Counts occurrences of each of the 64 codons in a coding sequence.
380#[derive(Debug, Clone)]
381pub struct CodonUsage {
382    counts: [u64; 64],
383    total: u64,
384}
385
386impl CodonUsage {
387    /// Build a codon usage table from a coding sequence.
388    ///
389    /// The sequence should be in-frame coding DNA/RNA. Non-standard bases
390    /// and incomplete trailing codons are skipped.
391    pub fn from_sequence(seq: &[u8]) -> Self {
392        let mut counts = [0u64; 64];
393        let mut total = 0u64;
394        for codon in seq.chunks_exact(3) {
395            if let Some(idx) = codon_index(codon) {
396                counts[idx] += 1;
397                total += 1;
398            }
399        }
400        CodonUsage { counts, total }
401    }
402
403    /// Merge another usage table into this one.
404    pub fn merge(&mut self, other: &CodonUsage) {
405        for i in 0..64 {
406            self.counts[i] += other.counts[i];
407        }
408        self.total += other.total;
409    }
410
411    /// Raw count for a codon.
412    pub fn count(&self, codon: &[u8]) -> u64 {
413        codon_index(codon).map_or(0, |idx| self.counts[idx])
414    }
415
416    /// Relative frequency of a codon (count / total codons).
417    pub fn frequency(&self, codon: &[u8]) -> f64 {
418        if self.total == 0 {
419            return 0.0;
420        }
421        self.count(codon) as f64 / self.total as f64
422    }
423
424    /// Relative Synonymous Codon Usage for a codon.
425    ///
426    /// RSCU = observed / expected, where expected = (total for AA) / (number of synonymous codons).
427    /// An RSCU of 1.0 means no bias.
428    pub fn rscu(&self, codon: &[u8], code: &GeneticCode) -> f64 {
429        let idx = match codon_index(codon) {
430            Some(i) => i,
431            None => return 0.0,
432        };
433        let aa = code.aa_at(idx);
434        if aa == b'*' {
435            return 0.0;
436        }
437
438        // Find all synonymous codons
439        let synonymous: Vec<usize> = (0..64)
440            .filter(|&i| code.aa_at(i) == aa)
441            .collect();
442        let n_syn = synonymous.len() as f64;
443        let total_syn: u64 = synonymous.iter().map(|&i| self.counts[i]).sum();
444
445        if total_syn == 0 {
446            return 0.0;
447        }
448
449        let observed = self.counts[idx] as f64;
450        let expected = total_syn as f64 / n_syn;
451        observed / expected
452    }
453
454    /// Compute the relative adaptiveness (w_i) for all 64 codons.
455    ///
456    /// For each amino acid, w_i = RSCU_i / max(RSCU for that AA).
457    /// Stop codons get w = 0.0.
458    pub fn relative_adaptiveness(&self, code: &GeneticCode) -> [f64; 64] {
459        let mut w = [0.0f64; 64];
460
461        // Group codons by amino acid
462        for aa in b"ACDEFGHIKLMNPQRSTVWY".iter() {
463            let synonymous: Vec<usize> = (0..64)
464                .filter(|&i| code.aa_at(i) == *aa)
465                .collect();
466            if synonymous.is_empty() {
467                continue;
468            }
469
470            let rscu_vals: Vec<f64> = synonymous
471                .iter()
472                .map(|&i| {
473                    let total_syn: u64 = synonymous.iter().map(|&j| self.counts[j]).sum();
474                    if total_syn == 0 {
475                        return 0.0;
476                    }
477                    let n_syn = synonymous.len() as f64;
478                    self.counts[i] as f64 / (total_syn as f64 / n_syn)
479                })
480                .collect();
481
482            let max_rscu = rscu_vals
483                .iter()
484                .cloned()
485                .fold(0.0f64, f64::max);
486
487            if max_rscu > 0.0 {
488                for (k, &idx) in synonymous.iter().enumerate() {
489                    w[idx] = rscu_vals[k] / max_rscu;
490                }
491            }
492        }
493
494        w
495    }
496}
497
498// ---------------------------------------------------------------------------
499// CAI
500// ---------------------------------------------------------------------------
501
502/// Compute the Codon Adaptation Index of a query sequence against a reference usage table.
503///
504/// CAI is the geometric mean of relative adaptiveness values for each codon.
505/// Returns a value in (0, 1]. Higher values indicate better adaptation.
506///
507/// # Errors
508///
509/// Returns an error if the query sequence has no valid sense codons.
510pub fn codon_adaptation_index(
511    query: &[u8],
512    reference: &CodonUsage,
513    code: &GeneticCode,
514) -> Result<f64> {
515    let w = reference.relative_adaptiveness(code);
516    let mut log_sum = 0.0f64;
517    let mut n = 0u64;
518
519    for codon in query.chunks_exact(3) {
520        if let Some(idx) = codon_index(codon) {
521            let aa = code.aa_at(idx);
522            if aa == b'*' {
523                continue; // skip stop codons
524            }
525            let wi = w[idx];
526            if wi > 0.0 {
527                log_sum += wi.ln();
528                n += 1;
529            }
530        }
531    }
532
533    if n == 0 {
534        return Err(CyaneaError::InvalidInput(
535            "no valid sense codons in query".into(),
536        ));
537    }
538
539    Ok((log_sum / n as f64).exp())
540}
541
542// ---------------------------------------------------------------------------
543// Synonymous / non-synonymous classification
544// ---------------------------------------------------------------------------
545
546/// Classification of a single-nucleotide substitution between two codons.
547#[derive(Debug, Clone, Copy, PartialEq, Eq)]
548pub enum SubstitutionClass {
549    /// Both codons encode the same amino acid.
550    Synonymous,
551    /// The codons encode different amino acids (neither is a stop).
552    NonSynonymous,
553    /// At least one codon is a stop codon.
554    StopInvolved,
555}
556
557/// Classify the substitution between two codons that differ at exactly one position.
558///
559/// If the codons are identical or differ at more than one position, the
560/// classification is based on the amino acid change (or lack thereof).
561pub fn classify_substitution(
562    codon1: &[u8],
563    codon2: &[u8],
564    code: &GeneticCode,
565) -> SubstitutionClass {
566    let idx1 = match codon_index(codon1) {
567        Some(i) => i,
568        None => return SubstitutionClass::StopInvolved,
569    };
570    let idx2 = match codon_index(codon2) {
571        Some(i) => i,
572        None => return SubstitutionClass::StopInvolved,
573    };
574
575    let aa1 = code.aa_at(idx1);
576    let aa2 = code.aa_at(idx2);
577
578    if aa1 == b'*' || aa2 == b'*' {
579        SubstitutionClass::StopInvolved
580    } else if aa1 == aa2 {
581        SubstitutionClass::Synonymous
582    } else {
583        SubstitutionClass::NonSynonymous
584    }
585}
586
587/// Count the number of synonymous and non-synonymous sites in a coding sequence.
588///
589/// Uses the Nei-Gojobori method: for each codon, consider all possible
590/// single-nucleotide changes and classify as synonymous or non-synonymous.
591/// Each codon contributes 3 sites total.
592pub fn count_syn_nonsyn_sites(seq: &[u8], code: &GeneticCode) -> (f64, f64) {
593    let mut syn = 0.0f64;
594    let mut nonsyn = 0.0f64;
595    let bases: [u8; 4] = [b'A', b'C', b'G', b'T'];
596
597    for codon in seq.chunks_exact(3) {
598        let idx = match codon_index(codon) {
599            Some(i) => i,
600            None => continue,
601        };
602        let aa = code.aa_at(idx);
603        if aa == b'*' {
604            continue;
605        }
606
607        // For each position in the codon
608        for pos in 0..3 {
609            let mut s = 0.0f64;
610            let mut n = 0.0f64;
611            let orig = codon[pos].to_ascii_uppercase();
612            let orig = if orig == b'U' { b'T' } else { orig };
613
614            for &base in &bases {
615                if base == orig {
616                    continue;
617                }
618                let mut mutant = [codon[0], codon[1], codon[2]];
619                mutant[pos] = base;
620                let midx = match codon_index(&mutant) {
621                    Some(i) => i,
622                    None => continue,
623                };
624                let maa = code.aa_at(midx);
625                if maa == b'*' {
626                    // Stop mutation — count as non-synonymous
627                    n += 1.0;
628                } else if maa == aa {
629                    s += 1.0;
630                } else {
631                    n += 1.0;
632                }
633            }
634
635            let total = s + n;
636            if total > 0.0 {
637                syn += s / total;
638                nonsyn += n / total;
639            }
640        }
641    }
642
643    (syn, nonsyn)
644}
645
646// ---------------------------------------------------------------------------
647// Tests
648// ---------------------------------------------------------------------------
649
650#[cfg(test)]
651mod tests {
652    use super::*;
653
654    #[test]
655    fn standard_all_64_codons() {
656        let code = GeneticCode::standard();
657        // AAA=K, AAC=N, AAG=K, AAT=N
658        assert_eq!(code.translate_codon(b"AAA"), Some(b'K'));
659        assert_eq!(code.translate_codon(b"AAC"), Some(b'N'));
660        assert_eq!(code.translate_codon(b"AAG"), Some(b'K'));
661        assert_eq!(code.translate_codon(b"AAT"), Some(b'N'));
662        // ATG=M (start)
663        assert_eq!(code.translate_codon(b"ATG"), Some(b'M'));
664        // TAA=stop, TAG=stop, TGA=stop
665        assert_eq!(code.translate_codon(b"TAA"), None);
666        assert_eq!(code.translate_codon(b"TAG"), None);
667        assert_eq!(code.translate_codon(b"TGA"), None);
668        // TGG=W
669        assert_eq!(code.translate_codon(b"TGG"), Some(b'W'));
670        // TTT=F, TTC=F
671        assert_eq!(code.translate_codon(b"TTT"), Some(b'F'));
672        assert_eq!(code.translate_codon(b"TTC"), Some(b'F'));
673        // GGG=G
674        assert_eq!(code.translate_codon(b"GGG"), Some(b'G'));
675    }
676
677    #[test]
678    fn backward_compat_translate_codon() {
679        assert_eq!(translate_codon(b"ATG"), Some(b'M'));
680        assert_eq!(translate_codon(b"TAA"), None);
681        assert_eq!(translate_codon(b"UGG"), Some(b'W'));
682    }
683
684    #[test]
685    fn backward_compat_translate_sequence() {
686        let protein = translate_sequence(b"AUGUUUUAA");
687        assert_eq!(protein, b"MF");
688    }
689
690    #[test]
691    fn translate_dna_codons() {
692        assert_eq!(translate_codon(b"ATG"), Some(b'M'));
693        assert_eq!(translate_codon(b"TAA"), None);
694    }
695
696    #[test]
697    fn translate_sequence_basic() {
698        let protein = translate_sequence(b"AUGUUUUAA");
699        assert_eq!(protein, b"MF");
700    }
701
702    #[test]
703    fn translate_sequence_incomplete_codon_ignored() {
704        let protein = translate_sequence(b"AUGUUUAU");
705        assert_eq!(protein, b"MF");
706    }
707
708    #[test]
709    fn translate_sequence_full_includes_stops() {
710        let code = GeneticCode::standard();
711        let protein = code.translate_sequence_full(b"ATGTAAGCG");
712        assert_eq!(protein, &[b'M', b'*', b'A']);
713    }
714
715    #[test]
716    fn rna_codons_work() {
717        let code = GeneticCode::standard();
718        assert_eq!(code.translate_codon(b"AUG"), Some(b'M'));
719        assert_eq!(code.translate_codon(b"UAA"), None);
720    }
721
722    #[test]
723    fn vertebrate_mito_differences() {
724        let code = GeneticCode::from_id(GeneticCodeId::VertebrateMitochondrial);
725        // UGA=Trp (not stop)
726        assert_eq!(code.translate_codon(b"TGA"), Some(b'W'));
727        // AGA=Stop (not Arg)
728        assert_eq!(code.translate_codon(b"AGA"), None);
729        // AGG=Stop (not Arg)
730        assert_eq!(code.translate_codon(b"AGG"), None);
731        // AUA=Met (not Ile)
732        assert_eq!(code.translate_codon(b"ATA"), Some(b'M'));
733    }
734
735    #[test]
736    fn yeast_mito_differences() {
737        let code = GeneticCode::from_id(GeneticCodeId::YeastMitochondrial);
738        // CUG=Thr (not Leu)
739        assert_eq!(code.translate_codon(b"CTG"), Some(b'T'));
740        // CUA=Thr (not Leu)
741        assert_eq!(code.translate_codon(b"CTA"), Some(b'T'));
742        // UGA=Trp
743        assert_eq!(code.translate_codon(b"TGA"), Some(b'W'));
744        // AUA=Met
745        assert_eq!(code.translate_codon(b"ATA"), Some(b'M'));
746    }
747
748    #[test]
749    fn mycoplasma_differences() {
750        let code = GeneticCode::from_id(GeneticCodeId::MycoplasmaSpiroplasma);
751        // UGA=Trp (not stop)
752        assert_eq!(code.translate_codon(b"TGA"), Some(b'W'));
753        // Standard stops still apply
754        assert_eq!(code.translate_codon(b"TAA"), None);
755        assert_eq!(code.translate_codon(b"TAG"), None);
756    }
757
758    #[test]
759    fn invertebrate_mito_differences() {
760        let code = GeneticCode::from_id(GeneticCodeId::InvertebrateMitochondrial);
761        // AGA=Ser (not Arg)
762        assert_eq!(code.translate_codon(b"AGA"), Some(b'S'));
763        // AGG=Ser (not Arg)
764        assert_eq!(code.translate_codon(b"AGG"), Some(b'S'));
765        // UGA=Trp
766        assert_eq!(code.translate_codon(b"TGA"), Some(b'W'));
767    }
768
769    #[test]
770    fn ciliate_differences() {
771        let code = GeneticCode::from_id(GeneticCodeId::CiliateNuclear);
772        // UAA=Gln (not stop)
773        assert_eq!(code.translate_codon(b"TAA"), Some(b'Q'));
774        // UAG=Gln (not stop)
775        assert_eq!(code.translate_codon(b"TAG"), Some(b'Q'));
776        // UGA still stop
777        assert_eq!(code.translate_codon(b"TGA"), None);
778    }
779
780    #[test]
781    fn bacterial_plastid_same_aas_different_starts() {
782        let code = GeneticCode::from_id(GeneticCodeId::BacterialPlastid);
783        // Same AAs as standard
784        assert_eq!(code.translate_codon(b"ATG"), Some(b'M'));
785        assert_eq!(code.translate_codon(b"TGA"), None);
786        // GTG and TTG are starts
787        assert!(code.is_start(b"GTG"));
788        assert!(code.is_start(b"TTG"));
789        assert!(code.is_start(b"ATG"));
790    }
791
792    #[test]
793    fn start_stop_codons() {
794        let code = GeneticCode::standard();
795        assert!(code.is_start(b"ATG"));
796        assert!(!code.is_start(b"GTG"));
797        assert!(code.is_stop(b"TAA"));
798        assert!(code.is_stop(b"TAG"));
799        assert!(code.is_stop(b"TGA"));
800        assert!(!code.is_stop(b"ATG"));
801
802        let stops = code.stop_codons();
803        assert_eq!(stops.len(), 3);
804        let starts = code.start_codons();
805        assert_eq!(starts.len(), 1);
806    }
807
808    #[test]
809    fn codon_usage_basic() {
810        // ATGATG = two ATG codons
811        let usage = CodonUsage::from_sequence(b"ATGATG");
812        assert_eq!(usage.count(b"ATG"), 2);
813        assert_eq!(usage.total, 2);
814        assert!((usage.frequency(b"ATG") - 1.0).abs() < 1e-10);
815    }
816
817    #[test]
818    fn codon_usage_merge() {
819        let mut u1 = CodonUsage::from_sequence(b"ATGATG");
820        let u2 = CodonUsage::from_sequence(b"ATGAAA");
821        u1.merge(&u2);
822        assert_eq!(u1.count(b"ATG"), 3);
823        assert_eq!(u1.count(b"AAA"), 1);
824        assert_eq!(u1.total, 4);
825    }
826
827    #[test]
828    fn rscu_single_codon_family() {
829        let code = GeneticCode::standard();
830        // ATG is the only Met codon → RSCU = 1.0
831        let usage = CodonUsage::from_sequence(b"ATGATGATG");
832        let rscu = usage.rscu(b"ATG", &code);
833        assert!((rscu - 1.0).abs() < 1e-10);
834    }
835
836    #[test]
837    fn rscu_two_fold_degenerate() {
838        let code = GeneticCode::standard();
839        // Phe: TTC and TTT. If all are TTT, RSCU(TTT)=2.0, RSCU(TTC)=0.0
840        let usage = CodonUsage::from_sequence(b"TTTTTTTTT");
841        let rscu_ttt = usage.rscu(b"TTT", &code);
842        assert!((rscu_ttt - 2.0).abs() < 1e-10);
843        let rscu_ttc = usage.rscu(b"TTC", &code);
844        assert!(rscu_ttc.abs() < 1e-10);
845    }
846
847    #[test]
848    fn cai_high_adaptation() {
849        let code = GeneticCode::standard();
850        // Reference and query use same codons → CAI ≈ 1.0
851        let ref_seq = b"ATGTTTAAA";
852        let reference = CodonUsage::from_sequence(ref_seq);
853        let cai = codon_adaptation_index(ref_seq, &reference, &code).unwrap();
854        assert!((cai - 1.0).abs() < 0.01, "CAI={}", cai);
855    }
856
857    #[test]
858    fn cai_error_on_empty() {
859        let code = GeneticCode::standard();
860        let reference = CodonUsage::from_sequence(b"ATGATG");
861        let result = codon_adaptation_index(b"", &reference, &code);
862        assert!(result.is_err());
863    }
864
865    #[test]
866    fn classify_synonymous() {
867        let code = GeneticCode::standard();
868        // TTT→TTC: both Phe
869        assert_eq!(
870            classify_substitution(b"TTT", b"TTC", &code),
871            SubstitutionClass::Synonymous
872        );
873    }
874
875    #[test]
876    fn classify_nonsynonymous() {
877        let code = GeneticCode::standard();
878        // ATG(Met)→GTG(Val)
879        assert_eq!(
880            classify_substitution(b"ATG", b"GTG", &code),
881            SubstitutionClass::NonSynonymous
882        );
883    }
884
885    #[test]
886    fn classify_stop_involved() {
887        let code = GeneticCode::standard();
888        assert_eq!(
889            classify_substitution(b"ATG", b"TAG", &code),
890            SubstitutionClass::StopInvolved
891        );
892    }
893
894    #[test]
895    fn count_sites_basic() {
896        let code = GeneticCode::standard();
897        // ATG has 0 synonymous sites (only Met codon), 3 non-synonymous
898        let (s, n) = count_syn_nonsyn_sites(b"ATG", &code);
899        assert!(s < 0.5, "ATG syn sites should be ~0, got {}", s);
900        assert!(n > 2.5, "ATG nonsyn sites should be ~3, got {}", n);
901    }
902
903    #[test]
904    fn count_sites_four_fold() {
905        let code = GeneticCode::standard();
906        // GCN (Ala) — 4-fold degenerate at 3rd position
907        // Third position: 3 of 3 possible changes are synonymous → 1 syn site
908        let (s, _n) = count_syn_nonsyn_sites(b"GCA", &code);
909        assert!(s > 0.5, "Ala codon should have >0.5 syn sites, got {}", s);
910    }
911}
912
913#[cfg(test)]
914mod proptests {
915    use super::*;
916    use proptest::prelude::*;
917
918    fn coding_seq(max_codons: usize) -> impl Strategy<Value = Vec<u8>> {
919        proptest::collection::vec(
920            prop_oneof![Just(b'A'), Just(b'C'), Just(b'G'), Just(b'T')],
921            3..=(max_codons * 3),
922        )
923        .prop_map(|v| {
924            let len = v.len() - (v.len() % 3);
925            v[..len].to_vec()
926        })
927    }
928
929    proptest! {
930        #[test]
931        fn cai_in_unit_interval(seq in coding_seq(20)) {
932            let code = GeneticCode::standard();
933            let reference = CodonUsage::from_sequence(&seq);
934            if let Ok(cai) = codon_adaptation_index(&seq, &reference, &code) {
935                prop_assert!(cai > 0.0 && cai <= 1.0 + 1e-10,
936                    "CAI should be in (0,1], got {}", cai);
937            }
938        }
939    }
940}