Skip to main content

vareffect/
codon.rs

1//! Codon translation, DNA complement, and VEP-style display formatting.
2//!
3//! All functions operate on **uppercase ASCII bytes** (`b'A'`, `b'C'`, `b'G'`,
4//! `b'T'`). The codon lookup tables use an internal 6-bit encoding but callers
5//! never see it — pass raw ASCII in, get ASCII out.
6//!
7//! # Genetic codes
8//!
9//! Two translation tables are provided:
10//!
11//! - **Standard (NCBI table 1)** — used for all autosomal and sex-chromosome
12//!   transcripts. [`translate_codon`] uses this table.
13//! - **Vertebrate mitochondrial (NCBI table 2)** — used for chrM transcripts.
14//!   Four codons differ: `TGA→W`, `AGA→*`, `AGG→*`, `ATA→M`.
15//!
16//! [`translate_codon_for_transcript`] dispatches to the correct table based on
17//! an `is_mitochondrial` flag (derived from `transcript.chrom == "chrM"`).
18
19// ---------------------------------------------------------------------------
20// Internal encoding
21// ---------------------------------------------------------------------------
22
/// Convert one uppercase ASCII nucleotide into its 2-bit table coordinate.
///
/// The coordinate is the base's position in the alphabet `ACGT`
/// (`A`=0, `C`=1, `G`=2, `T`=3). Every other byte — `N`, IUPAC ambiguity
/// codes, lowercase letters — yields `None`.
fn base_to_index(b: u8) -> Option<usize> {
    // The position inside this literal *is* the encoding, so the two can
    // never drift apart.
    const ALPHABET: &[u8; 4] = b"ACGT";
    ALPHABET.iter().position(|&known| known == b)
}
34
35/// Pack a 3-base codon into a 6-bit index (0–63) for table lookup.
36/// Returns `None` if any base is not in {A, C, G, T}.
37fn codon_to_index(codon: &[u8; 3]) -> Option<usize> {
38    let a = base_to_index(codon[0])?;
39    let b = base_to_index(codon[1])?;
40    let c = base_to_index(codon[2])?;
41    Some(a * 16 + b * 4 + c)
42}
43
44// ---------------------------------------------------------------------------
45// Standard genetic code (NCBI translation table 1)
46// ---------------------------------------------------------------------------
47
/// Standard genetic code (NCBI translation table 1) as a flat 64-entry table.
///
/// Index = `base1*16 + base2*4 + base3` with A=0, C=1, G=2, T=3. Every entry
/// is a one-letter amino acid code; `b'*'` marks a stop codon.
///
/// The literal below holds one line per first base (A, C, G, T). Within a
/// line, each group of four characters is one row of this grid
/// (row = first two bases, column = third base):
/// ```text
///        A    C    G    T
/// AA:    K    N    K    N
/// AC:    T    T    T    T
/// AG:    R    S    R    S
/// AT:    I    I    M    I
/// CA:    Q    H    Q    H
/// CC:    P    P    P    P
/// CG:    R    R    R    R
/// CT:    L    L    L    L
/// GA:    E    D    E    D
/// GC:    A    A    A    A
/// GG:    G    G    G    G
/// GT:    V    V    V    V
/// TA:    *    Y    *    Y
/// TC:    S    S    S    S
/// TG:    *    C    W    C
/// TT:    L    F    L    F
/// ```
// The `[u8; 64]` type makes a literal of the wrong length a compile error.
static STANDARD_TABLE: [u8; 64] = *b"\
    KNKNTTTTRSRSIIMI\
    QHQHPPPPRRRRLLLL\
    EDEDAAAAGGGGVVVV\
    *Y*YSSSS*CWCLFLF";
94
95// ---------------------------------------------------------------------------
96// Vertebrate mitochondrial genetic code (NCBI translation table 2)
97// ---------------------------------------------------------------------------
98
/// Vertebrate mitochondrial genetic code (NCBI translation table 2) as a flat
/// 64-entry table, indexed exactly like the standard table
/// (`base1*16 + base2*4 + base3`, A=0, C=1, G=2, T=3).
///
/// Four codons differ from the standard code:
/// - `TGA` → `W` (Trp, not stop)
/// - `AGA` → `*` (stop, not Arg)
/// - `AGG` → `*` (stop, not Arg)
/// - `ATA` → `M` (Met, not Ile)
///
/// The literal holds one line per first base (A, C, G, T); each group of four
/// characters covers one two-base prefix with the third base in A, C, G, T
/// order. The differing entries sit in the `AG` and `AT` groups of the first
/// line and in the `TG` group of the last line.
// The `[u8; 64]` type makes a literal of the wrong length a compile error.
static MITO_TABLE: [u8; 64] = *b"\
    KNKNTTTT*S*SMIMI\
    QHQHPPPPRRRRLLLL\
    EDEDAAAAGGGGVVVV\
    *Y*YSSSSWCWCLFLF";
126
127// ---------------------------------------------------------------------------
128// Public translation API
129// ---------------------------------------------------------------------------
130
131/// Translate a 3-base codon to a single amino acid character using the
132/// standard genetic code (NCBI table 1).
133///
134/// # Arguments
135///
136/// * `codon` — Three uppercase ASCII DNA bases (e.g., `b"ATG"`)
137///
138/// # Returns
139///
140/// One-letter amino acid code as a `u8`:
141/// - Standard amino acids: `b'A'`..`b'Y'`
142/// - Stop codons (TAA, TAG, TGA): `b'*'`
143/// - Ambiguous codons (containing N or other non-ACGT bases): `b'X'`
144///
145/// # Examples
146///
147/// ```
148/// use vareffect::codon::translate_codon;
149/// assert_eq!(translate_codon(b"ATG"), b'M');
150/// assert_eq!(translate_codon(b"TAA"), b'*');
151/// assert_eq!(translate_codon(b"NNN"), b'X');
152/// ```
153pub fn translate_codon(codon: &[u8; 3]) -> u8 {
154    match codon_to_index(codon) {
155        Some(idx) => STANDARD_TABLE[idx],
156        None => b'X',
157    }
158}
159
160/// Translate a 3-base codon using the vertebrate mitochondrial genetic code
161/// (NCBI table 2).
162///
163/// Differs from [`translate_codon`] at four codons:
164/// `TGA→W`, `AGA→*`, `AGG→*`, `ATA→M`.
165///
166/// # Examples
167///
168/// ```
169/// use vareffect::codon::translate_codon_mito;
170/// assert_eq!(translate_codon_mito(b"TGA"), b'W');
171/// assert_eq!(translate_codon_mito(b"AGA"), b'*');
172/// ```
173pub fn translate_codon_mito(codon: &[u8; 3]) -> u8 {
174    match codon_to_index(codon) {
175        Some(idx) => MITO_TABLE[idx],
176        None => b'X',
177    }
178}
179
180/// Translate a codon using the appropriate genetic code for a transcript.
181///
182/// Dispatches to the vertebrate mitochondrial code (NCBI table 2) when
183/// `is_mitochondrial` is true, otherwise uses the standard code (table 1).
184///
185/// # Arguments
186///
187/// * `codon` — Three uppercase ASCII DNA bases
188/// * `is_mitochondrial` — `true` for chrM transcripts
189///
190/// # Examples
191///
192/// ```
193/// use vareffect::codon::translate_codon_for_transcript;
194/// // Standard code: TGA is stop
195/// assert_eq!(translate_codon_for_transcript(b"TGA", false), b'*');
196/// // Mitochondrial code: TGA is Trp
197/// assert_eq!(translate_codon_for_transcript(b"TGA", true), b'W');
198/// ```
199pub fn translate_codon_for_transcript(codon: &[u8; 3], is_mitochondrial: bool) -> u8 {
200    if is_mitochondrial {
201        translate_codon_mito(codon)
202    } else {
203        translate_codon(codon)
204    }
205}
206
207// ---------------------------------------------------------------------------
208// DNA complement
209// ---------------------------------------------------------------------------
210
/// Return the Watson-Crick partner of a single uppercase DNA base
/// (`A↔T`, `C↔G`).
///
/// Any byte that is not `A`, `C`, `G` or `T` — `N`, lowercase letters, gap
/// characters — is returned unchanged.
///
/// # Examples
///
/// ```
/// use vareffect::codon::complement;
/// assert_eq!(complement(b'A'), b'T');
/// assert_eq!(complement(b'N'), b'N');
/// ```
pub fn complement(base: u8) -> u8 {
    // (base, partner) for each of the four canonical nucleotides.
    const PAIRS: [(u8, u8); 4] = [(b'A', b'T'), (b'T', b'A'), (b'C', b'G'), (b'G', b'C')];
    PAIRS
        .iter()
        .find(|&&(from, _)| from == base)
        .map_or(base, |&(_, partner)| partner)
}
230
231/// Complement a DNA sequence in place. Each base is replaced with its
232/// Watson-Crick complement; non-ACGT bytes are left unchanged.
233pub fn complement_in_place(seq: &mut [u8]) {
234    for base in seq.iter_mut() {
235        *base = complement(*base);
236    }
237}
238
239/// Return the reverse complement of a DNA sequence.
240///
241/// # Examples
242///
243/// ```
244/// use vareffect::codon::reverse_complement;
245/// assert_eq!(reverse_complement(b"ATCG"), b"CGAT");
246/// ```
247pub fn reverse_complement(seq: &[u8]) -> Vec<u8> {
248    seq.iter().rev().map(|&b| complement(b)).collect()
249}
250
251// ---------------------------------------------------------------------------
252// Amino acid display helpers
253// ---------------------------------------------------------------------------
254
/// Look up the three-letter abbreviation for a one-letter amino acid code.
///
/// The 20 standard amino acids map to their usual abbreviations and the stop
/// symbol `*` maps to `"Ter"`. Every other byte — the `X` placeholder as well
/// as arbitrary invalid input — maps to `"Xaa"` ("any / unknown amino acid"),
/// so the function never panics.
///
/// # Examples
///
/// ```
/// use vareffect::codon::aa_three_letter;
/// assert_eq!(aa_three_letter(b'M'), "Met");
/// assert_eq!(aa_three_letter(b'*'), "Ter");
/// assert_eq!(aa_three_letter(b'X'), "Xaa");
/// ```
pub fn aa_three_letter(one_letter: u8) -> &'static str {
    const NAMES: [(u8, &str); 21] = [
        (b'A', "Ala"),
        (b'C', "Cys"),
        (b'D', "Asp"),
        (b'E', "Glu"),
        (b'F', "Phe"),
        (b'G', "Gly"),
        (b'H', "His"),
        (b'I', "Ile"),
        (b'K', "Lys"),
        (b'L', "Leu"),
        (b'M', "Met"),
        (b'N', "Asn"),
        (b'P', "Pro"),
        (b'Q', "Gln"),
        (b'R', "Arg"),
        (b'S', "Ser"),
        (b'T', "Thr"),
        (b'V', "Val"),
        (b'W', "Trp"),
        (b'Y', "Tyr"),
        (b'*', "Ter"),
    ];

    for &(code, name) in &NAMES {
        if code == one_letter {
            return name;
        }
    }
    // `X` and any unrecognised byte share one sentinel: a public helper
    // should never panic on caller-supplied data.
    "Xaa"
}
299
300// ---------------------------------------------------------------------------
301// VEP-style display formatting
302// ---------------------------------------------------------------------------
303
/// Render a ref/alt codon pair in VEP's `xXx/xYx` display form.
///
/// Bases away from `changed_pos` are lowercased. The base at `changed_pos`
/// is emitted exactly as supplied, so it appears uppercase for the uppercase
/// input this module expects. The two codons are joined by `/`.
///
/// # Arguments
///
/// * `ref_codon` — Reference codon (3 uppercase ASCII bases)
/// * `alt_codon` — Alternate codon (3 uppercase ASCII bases)
/// * `changed_pos` — 0-based position of the changed base (0, 1, or 2);
///   checked with a `debug_assert!` only
///
/// # Examples
///
/// ```
/// use vareffect::codon::format_codons;
/// assert_eq!(format_codons(b"CGT", b"TGT", 0), "Cgt/Tgt");
/// assert_eq!(format_codons(b"CGT", b"CAT", 1), "cGt/cAt");
/// assert_eq!(format_codons(b"CGT", b"CGA", 2), "cgT/cgA");
/// ```
pub fn format_codons(ref_codon: &[u8; 3], alt_codon: &[u8; 3], changed_pos: u8) -> String {
    debug_assert!(changed_pos < 3, "codon position must be 0, 1, or 2");
    let highlighted = usize::from(changed_pos);

    // Both sides follow the same rule, so render them with one routine.
    let render = |codon: &[u8; 3], out: &mut String| {
        for (offset, &base) in codon.iter().enumerate() {
            let symbol = char::from(base);
            out.push(if offset == highlighted {
                symbol
            } else {
                symbol.to_ascii_lowercase()
            });
        }
    };

    let mut text = String::with_capacity(7); // "xxx/xxx"
    render(ref_codon, &mut text);
    text.push('/');
    render(alt_codon, &mut text);
    text
}
343
/// Render a single-residue amino acid change the way VEP displays it.
///
/// - Same residue on both sides (synonymous): just the letter, e.g. `"R"`
/// - Different residues: `"R/W"`
/// - Stop gained follows the same rule: `"R/*"`
///
/// # Examples
///
/// ```
/// use vareffect::codon::format_amino_acids;
/// assert_eq!(format_amino_acids(b'R', b'W'), "R/W");
/// assert_eq!(format_amino_acids(b'R', b'R'), "R");
/// assert_eq!(format_amino_acids(b'R', b'*'), "R/*");
/// ```
pub fn format_amino_acids(ref_aa: u8, alt_aa: u8) -> String {
    let reference = char::from(ref_aa);
    if ref_aa == alt_aa {
        return reference.to_string();
    }

    let mut text = String::with_capacity(3); // "R/W"
    text.push(reference);
    text.push('/');
    text.push(char::from(alt_aa));
    text
}
365
366// ---------------------------------------------------------------------------
367// Indel helpers
368// ---------------------------------------------------------------------------
369
370/// Translate a DNA sequence to amino acids, codon by codon.
371///
372/// Uses the appropriate genetic code based on `is_mitochondrial`.
373///
374/// # Arguments
375///
376/// * `seq` — Uppercase ASCII DNA bytes. Length must be divisible by 3.
377/// * `is_mitochondrial` — `true` for chrM transcripts (NCBI table 2)
378///
379/// # Returns
380///
381/// `Vec<u8>` of one-letter amino acid codes (same encoding as
382/// [`translate_codon`]).
383///
384/// # Errors
385///
386/// Returns [`crate::VarEffectError::Malformed`] if `seq.len() % 3 != 0`.
387pub fn translate_sequence(
388    seq: &[u8],
389    is_mitochondrial: bool,
390) -> Result<Vec<u8>, crate::VarEffectError> {
391    if !seq.len().is_multiple_of(3) {
392        return Err(crate::VarEffectError::Malformed(format!(
393            "translate_sequence: sequence length {} is not divisible by 3",
394            seq.len(),
395        )));
396    }
397    let mut aas = Vec::with_capacity(seq.len() / 3);
398    for codon_bytes in seq.chunks_exact(3) {
399        let codon: &[u8; 3] = codon_bytes
400            .try_into()
401            .expect("chunks_exact(3) always yields a 3-byte slice");
402        aas.push(translate_codon_for_transcript(codon, is_mitochondrial));
403    }
404    Ok(aas)
405}
406
/// Format ref/alt codon sequences for an indel with VEP's capitalisation
/// convention.
///
/// On the ref side, the bases at positions `[changed_start, changed_end)` of
/// `ref_seq` (the deleted bases) keep the case they were supplied in —
/// uppercase for the uppercase input this module expects — and all other ref
/// bases are lowercased. On the alt side, the first `changed_start` bases and
/// the last `ref_seq.len() - changed_end` bases are treated as unchanged
/// flanks and lowercased; whatever lies between them (the inserted bases)
/// keeps its case.
///
/// The two sequences are separated by `/`. An empty side is rendered as `-`.
///
/// # Arguments
///
/// * `ref_seq` — Codon-aligned reference CDS bases
/// * `alt_seq` — Codon-aligned alternate CDS bases (after deletion/insertion)
/// * `changed_start` — 0-based index in `ref_seq` where the change begins
/// * `changed_end` — 0-based exclusive end in `ref_seq` where the change ends
///   (for deletions: the range of deleted bases; for insertions: typically
///   `changed_start` since nothing is deleted in the ref). A value past the
///   end of `ref_seq` is treated as "no unchanged suffix" instead of
///   panicking.
///
/// # Examples
///
/// ```
/// use vareffect::codon::format_codons_indel;
/// // 3bp deletion at positions 3-5: "atgGAC/atg"
/// assert_eq!(
///     format_codons_indel(b"ATGGAC", b"ATG", 3, 6),
///     "atgGAC/atg"
/// );
/// // 3bp insertion at position 1: "agc/aGACgc"
/// assert_eq!(
///     format_codons_indel(b"AGC", b"AGACGC", 1, 1),
///     "agc/aGACgc"
/// );
/// ```
pub fn format_codons_indel(
    ref_seq: &[u8],
    alt_seq: &[u8],
    changed_start: usize,
    changed_end: usize,
) -> String {
    let mut result = String::with_capacity(ref_seq.len() + alt_seq.len() + 2);

    // Ref side: deleted bases [changed_start, changed_end) keep their case.
    if ref_seq.is_empty() {
        // Pure insertion with no ref codon context — VEP uses "-".
        result.push('-');
    } else {
        // An inverted range (start > end) is simply empty: nothing highlighted.
        let deleted = changed_start..changed_end;
        for (i, &b) in ref_seq.iter().enumerate() {
            let symbol = char::from(b);
            result.push(if deleted.contains(&i) {
                symbol
            } else {
                symbol.to_ascii_lowercase()
            });
        }
    }
    result.push('/');

    if alt_seq.is_empty() {
        // Complete deletion — VEP uses "-" for the empty alt side.
        result.push('-');
    } else {
        // Alt side: the unchanged prefix and suffix are lowercase; everything
        // between them (the inserted bases) keeps its case.
        let prefix_len = changed_start;
        // Saturate instead of subtracting: `changed_end > ref_seq.len()` used
        // to underflow here, panicking in debug builds and wrapping in
        // release builds (which lowercased the whole alt side).
        let suffix_len = ref_seq.len().saturating_sub(changed_end);
        let alt_suffix_start = alt_seq.len().saturating_sub(suffix_len);

        for (i, &b) in alt_seq.iter().enumerate() {
            let symbol = char::from(b);
            result.push(if i < prefix_len || i >= alt_suffix_start {
                symbol.to_ascii_lowercase()
            } else {
                symbol
            });
        }
    }
    result
}
485
/// Render the amino acid consequence of an indel.
///
/// The complete ref and alt peptide fragments (one-letter codes) are joined
/// by `/`. When both fragments are identical — an indel that is synonymous at
/// the protein level — only a single copy is returned. When they differ, an
/// empty fragment is written as `-` (VEP convention).
///
/// # Examples
///
/// ```
/// use vareffect::codon::format_amino_acids_indel;
/// assert_eq!(format_amino_acids_indel(b"RR", b"R"), "RR/R");
/// assert_eq!(format_amino_acids_indel(b"R", b"RDR"), "R/RDR");
/// assert_eq!(format_amino_acids_indel(b"M", b""), "M/-");
/// assert_eq!(format_amino_acids_indel(b"", b"X"), "-/X");
/// ```
pub fn format_amino_acids_indel(ref_aas: &[u8], alt_aas: &[u8]) -> String {
    if ref_aas == alt_aas {
        return ref_aas.iter().copied().map(char::from).collect();
    }

    // Shared rendering for either side of the slash.
    let append_side = |out: &mut String, aas: &[u8]| {
        if aas.is_empty() {
            out.push('-');
        } else {
            out.extend(aas.iter().copied().map(char::from));
        }
    };

    let mut text = String::with_capacity(ref_aas.len() + alt_aas.len() + 2);
    append_side(&mut text, ref_aas);
    text.push('/');
    append_side(&mut text, alt_aas);
    text
}
525
526// ---------------------------------------------------------------------------
527// Tests
528// ---------------------------------------------------------------------------
529
#[cfg(test)]
mod tests {
    use super::*;

    /// Check all 64 codons of the standard code against NCBI translation
    /// table 1.
    ///
    /// The expectation is written in NCBI's canonical layout: codons are
    /// enumerated with every base position cycling through `T, C, A, G`
    /// (first base slowest), and the amino acids are listed in that order.
    /// This ordering is independent of the `ACGT` ordering used by the
    /// lookup table under test.
    #[test]
    fn translate_all_64_codons() {
        const BASES: [u8; 4] = *b"TCAG";
        const NCBI_TABLE_1: &[u8; 64] =
            b"FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";

        let mut codons: Vec<[u8; 3]> = Vec::with_capacity(64);
        for first in BASES {
            for second in BASES {
                for third in BASES {
                    codons.push([first, second, third]);
                }
            }
        }

        // (codon_bytes, expected_aa)
        let expected: Vec<([u8; 3], u8)> = codons
            .into_iter()
            .zip(NCBI_TABLE_1.iter().copied())
            .collect();
        assert_eq!(expected.len(), 64);

        for (codon, aa) in &expected {
            let translated = translate_codon(codon);
            assert_eq!(
                translated,
                *aa,
                "codon {} should translate to {} but got {}",
                std::str::from_utf8(codon).unwrap(),
                *aa as char,
                translated as char,
            );
        }
    }

    /// TAA, TAG and TGA are the stop codons of the standard code.
    #[test]
    fn translate_stop_codons() {
        for stop in [b"TAA", b"TAG", b"TGA"] {
            assert_eq!(translate_codon(stop), b'*');
        }
    }

    /// Any codon with a non-ACGT byte translates to `X` (unknown).
    #[test]
    fn translate_ambiguous_codon() {
        // The last entry is lowercase, which also counts as non-ACGT because
        // input must be uppercase.
        for ambiguous in [b"NNN", b"ANG", b"ATN", b"atg"] {
            assert_eq!(translate_codon(ambiguous), b'X');
        }
    }

    /// The four codons that differ between the standard code and the
    /// vertebrate mitochondrial code (NCBI table 2), plus the dispatch helper.
    #[test]
    fn translate_mitochondrial_differences() {
        // (codon, standard translation, mitochondrial translation)
        let differing: [(&[u8; 3], u8, u8); 4] = [
            (b"TGA", b'*', b'W'), // stop vs Trp
            (b"AGA", b'R', b'*'), // Arg vs stop
            (b"AGG", b'R', b'*'), // Arg vs stop
            (b"ATA", b'I', b'M'), // Ile vs Met
        ];
        for (codon, standard, mito) in differing {
            assert_eq!(translate_codon(codon), standard);
            assert_eq!(translate_codon_mito(codon), mito);
        }

        // The dispatch helper must pick the table from the flag.
        assert_eq!(translate_codon_for_transcript(b"TGA", false), b'*');
        assert_eq!(translate_codon_for_transcript(b"TGA", true), b'W');
    }

    /// Watson-Crick complement: A↔T, C↔G, and N is passed through.
    #[test]
    fn complement_bases() {
        let pairs = [
            (b'A', b'T'),
            (b'T', b'A'),
            (b'C', b'G'),
            (b'G', b'C'),
            (b'N', b'N'),
        ];
        for (base, partner) in pairs {
            assert_eq!(complement(base), partner);
        }
    }

    /// Reverse complement on palindromic and asymmetric inputs, plus the
    /// in-place (non-reversing) complement.
    #[test]
    fn reverse_complement_sequence() {
        // Both of these are their own reverse complement.
        assert_eq!(reverse_complement(b"ATCGATCGAT"), b"ATCGATCGAT");
        assert_eq!(reverse_complement(b"AACCGGTT"), b"AACCGGTT");

        // Asymmetric sequence
        assert_eq!(reverse_complement(b"AAACCCGGGT"), b"ACCCGGGTTT");

        // complement_in_place keeps the order and only swaps the bases.
        let mut scratch = Vec::from(&b"ACGT"[..]);
        complement_in_place(&mut scratch);
        assert_eq!(scratch, b"TGCA");
    }

    /// Three-letter codes for the 20 standard amino acids plus Ter and Xaa.
    #[test]
    fn aa_three_letter_all_20() {
        let codes = b"ACDEFGHIKLMNPQRSTVWY*X";
        let names = [
            "Ala", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", "Lys", "Leu", "Met", "Asn",
            "Pro", "Gln", "Arg", "Ser", "Thr", "Val", "Trp", "Tyr", "Ter", "Xaa",
        ];
        assert_eq!(codes.len(), names.len());

        for (&code, expected) in codes.iter().zip(names) {
            assert_eq!(
                aa_three_letter(code),
                expected,
                "aa_three_letter({}) should be {}",
                code as char,
                expected,
            );
        }
    }

    /// VEP codon formatting with the change at the first codon position.
    #[test]
    fn format_codons_position_0() {
        let rendered = format_codons(b"CGT", b"TGT", 0);
        assert_eq!(rendered, "Cgt/Tgt");
    }

    /// VEP codon formatting with the change at the middle codon position.
    #[test]
    fn format_codons_position_1() {
        let rendered = format_codons(b"CGT", b"CAT", 1);
        assert_eq!(rendered, "cGt/cAt");
    }

    /// VEP codon formatting with the change at the last codon position.
    #[test]
    fn format_codons_position_2() {
        let rendered = format_codons(b"CGT", b"CGA", 2);
        assert_eq!(rendered, "cgT/cgA");
    }
}