vareffect/codon.rs
1//! Codon translation, DNA complement, and VEP-style display formatting.
2//!
3//! All functions operate on **uppercase ASCII bytes** (`b'A'`, `b'C'`, `b'G'`,
4//! `b'T'`). The codon lookup tables use an internal 6-bit encoding but callers
5//! never see it — pass raw ASCII in, get ASCII out.
6//!
7//! # Genetic codes
8//!
9//! Two translation tables are provided:
10//!
11//! - **Standard (NCBI table 1)** — used for all autosomal and sex-chromosome
12//! transcripts. [`translate_codon`] uses this table.
13//! - **Vertebrate mitochondrial (NCBI table 2)** — used for chrM transcripts.
14//! Four codons differ: `TGA→W`, `AGA→*`, `AGG→*`, `ATA→M`.
15//!
16//! [`translate_codon_for_transcript`] dispatches to the correct table based on
17//! an `is_mitochondrial` flag (derived from `transcript.chrom == "chrM"`).
18
19// ---------------------------------------------------------------------------
20// Internal encoding
21// ---------------------------------------------------------------------------
22
/// Convert one ASCII nucleotide into its 2-bit code: A=0, C=1, G=2, T=3.
///
/// Only the four uppercase bases are recognised. Anything else — `N`, IUPAC
/// ambiguity codes, lowercase letters — yields `None`.
fn base_to_index(b: u8) -> Option<usize> {
    // The position of a base inside this array *is* its 2-bit code.
    const BASES: [u8; 4] = *b"ACGT";
    BASES.iter().position(|&known| known == b)
}
34
35/// Pack a 3-base codon into a 6-bit index (0–63) for table lookup.
36/// Returns `None` if any base is not in {A, C, G, T}.
37fn codon_to_index(codon: &[u8; 3]) -> Option<usize> {
38 let a = base_to_index(codon[0])?;
39 let b = base_to_index(codon[1])?;
40 let c = base_to_index(codon[2])?;
41 Some(a * 16 + b * 4 + c)
42}
43
44// ---------------------------------------------------------------------------
45// Standard genetic code (NCBI translation table 1)
46// ---------------------------------------------------------------------------
47
/// Standard genetic code (NCBI translation table 1).
///
/// Byte `i` of the table is the one-letter amino acid for the codon whose
/// index is `i = base1*16 + base2*4 + base3` (A=0, C=1, G=2, T=3); `b'*'`
/// marks a stop codon.
///
/// The literal below is the following grid read row by row (row = first two
/// bases, column = third base):
/// ```text
///      A C G T
/// AA:  K N K N
/// AC:  T T T T
/// AG:  R S R S
/// AT:  I I M I
/// CA:  Q H Q H
/// CC:  P P P P
/// CG:  R R R R
/// CT:  L L L L
/// GA:  E D E D
/// GC:  A A A A
/// GG:  G G G G
/// GT:  V V V V
/// TA:  * Y * Y
/// TC:  S S S S
/// TG:  * C W C
/// TT:  L F L F
/// ```
static STANDARD_TABLE: [u8; 64] =
    // |  first base A  |  first base C  |  first base G  |  first base T  |
    *b"KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";
94
95// ---------------------------------------------------------------------------
96// Vertebrate mitochondrial genetic code (NCBI translation table 2)
97// ---------------------------------------------------------------------------
98
/// Vertebrate mitochondrial genetic code (NCBI translation table 2).
///
/// Indexed exactly like [`STANDARD_TABLE`]: byte `i` is the amino acid for
/// codon index `i = base1*16 + base2*4 + base3` (A=0, C=1, G=2, T=3).
///
/// Four entries differ from the standard code:
/// - `TGA` → `W` (Trp, not stop)
/// - `AGA` → `*` (stop, not Arg)
/// - `AGG` → `*` (stop, not Arg)
/// - `ATA` → `M` (Met, not Ile)
static MITO_TABLE: [u8; 64] =
    // |  first base A  |  first base C  |  first base G  |  first base T  |
    // Rows AG (`*S*S`), AT (`MIMI`) and TG (`WCWC`) carry the differences.
    *b"KNKNTTTT*S*SMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF";
126
127// ---------------------------------------------------------------------------
128// Public translation API
129// ---------------------------------------------------------------------------
130
131/// Translate a 3-base codon to a single amino acid character using the
132/// standard genetic code (NCBI table 1).
133///
134/// # Arguments
135///
136/// * `codon` — Three uppercase ASCII DNA bases (e.g., `b"ATG"`)
137///
138/// # Returns
139///
140/// One-letter amino acid code as a `u8`:
141/// - Standard amino acids: `b'A'`..`b'Y'`
142/// - Stop codons (TAA, TAG, TGA): `b'*'`
143/// - Ambiguous codons (containing N or other non-ACGT bases): `b'X'`
144///
145/// # Examples
146///
147/// ```
148/// use vareffect::codon::translate_codon;
149/// assert_eq!(translate_codon(b"ATG"), b'M');
150/// assert_eq!(translate_codon(b"TAA"), b'*');
151/// assert_eq!(translate_codon(b"NNN"), b'X');
152/// ```
153pub fn translate_codon(codon: &[u8; 3]) -> u8 {
154 match codon_to_index(codon) {
155 Some(idx) => STANDARD_TABLE[idx],
156 None => b'X',
157 }
158}
159
160/// Translate a 3-base codon using the vertebrate mitochondrial genetic code
161/// (NCBI table 2).
162///
163/// Differs from [`translate_codon`] at four codons:
164/// `TGA→W`, `AGA→*`, `AGG→*`, `ATA→M`.
165///
166/// # Examples
167///
168/// ```
169/// use vareffect::codon::translate_codon_mito;
170/// assert_eq!(translate_codon_mito(b"TGA"), b'W');
171/// assert_eq!(translate_codon_mito(b"AGA"), b'*');
172/// ```
173pub fn translate_codon_mito(codon: &[u8; 3]) -> u8 {
174 match codon_to_index(codon) {
175 Some(idx) => MITO_TABLE[idx],
176 None => b'X',
177 }
178}
179
180/// Translate a codon using the appropriate genetic code for a transcript.
181///
182/// Dispatches to the vertebrate mitochondrial code (NCBI table 2) when
183/// `is_mitochondrial` is true, otherwise uses the standard code (table 1).
184///
185/// # Arguments
186///
187/// * `codon` — Three uppercase ASCII DNA bases
188/// * `is_mitochondrial` — `true` for chrM transcripts
189///
190/// # Examples
191///
192/// ```
193/// use vareffect::codon::translate_codon_for_transcript;
194/// // Standard code: TGA is stop
195/// assert_eq!(translate_codon_for_transcript(b"TGA", false), b'*');
196/// // Mitochondrial code: TGA is Trp
197/// assert_eq!(translate_codon_for_transcript(b"TGA", true), b'W');
198/// ```
199pub fn translate_codon_for_transcript(codon: &[u8; 3], is_mitochondrial: bool) -> u8 {
200 if is_mitochondrial {
201 translate_codon_mito(codon)
202 } else {
203 translate_codon(codon)
204 }
205}
206
207// ---------------------------------------------------------------------------
208// DNA complement
209// ---------------------------------------------------------------------------
210
/// Return the Watson-Crick partner of a single DNA base (`A↔T`, `C↔G`).
///
/// Only the four uppercase bases are swapped; every other byte (e.g. `N`,
/// lowercase letters) is returned unchanged.
///
/// # Examples
///
/// ```
/// use vareffect::codon::complement;
/// assert_eq!(complement(b'A'), b'T');
/// assert_eq!(complement(b'N'), b'N');
/// ```
pub fn complement(base: u8) -> u8 {
    // XOR-ing either member of a pair with (x ^ y) yields the other member.
    const AT_SWAP: u8 = b'A' ^ b'T';
    const CG_SWAP: u8 = b'C' ^ b'G';
    match base {
        b'A' | b'T' => base ^ AT_SWAP,
        b'C' | b'G' => base ^ CG_SWAP,
        _ => base,
    }
}
230
231/// Complement a DNA sequence in place. Each base is replaced with its
232/// Watson-Crick complement; non-ACGT bytes are left unchanged.
233pub fn complement_in_place(seq: &mut [u8]) {
234 for base in seq.iter_mut() {
235 *base = complement(*base);
236 }
237}
238
239/// Return the reverse complement of a DNA sequence.
240///
241/// # Examples
242///
243/// ```
244/// use vareffect::codon::reverse_complement;
245/// assert_eq!(reverse_complement(b"ATCG"), b"CGAT");
246/// ```
247pub fn reverse_complement(seq: &[u8]) -> Vec<u8> {
248 seq.iter().rev().map(|&b| complement(b)).collect()
249}
250
251// ---------------------------------------------------------------------------
252// Amino acid display helpers
253// ---------------------------------------------------------------------------
254
/// Expand a one-letter amino acid code into its three-letter abbreviation.
///
/// - `*` (stop) becomes `"Ter"`.
/// - Each of the 20 standard amino acid letters becomes its usual
///   three-letter code.
/// - Everything else — the IUPAC `X` placeholder, unassigned letters,
///   lowercase, arbitrary bytes — becomes `"Xaa"`, the IUPAC "any / unknown
///   amino acid" symbol. The function therefore never panics.
///
/// # Examples
///
/// ```
/// use vareffect::codon::aa_three_letter;
/// assert_eq!(aa_three_letter(b'M'), "Met");
/// assert_eq!(aa_three_letter(b'*'), "Ter");
/// assert_eq!(aa_three_letter(b'X'), "Xaa");
/// ```
pub fn aa_three_letter(one_letter: u8) -> &'static str {
    // One slot per uppercase letter A..=Z; letters that are not one of the
    // 20 standard amino acids (B, J, O, U, X, Z) hold the "Xaa" sentinel.
    const BY_LETTER: [&str; 26] = [
        "Ala", "Xaa", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", // A-I
        "Xaa", "Lys", "Leu", "Met", "Asn", "Xaa", "Pro", "Gln", "Arg", // J-R
        "Ser", "Thr", "Xaa", "Val", "Trp", "Xaa", "Tyr", "Xaa", // S-Z
    ];

    match one_letter {
        b'*' => "Ter",
        // The range pattern guarantees the offset is within 0..26.
        b'A'..=b'Z' => BY_LETTER[usize::from(one_letter - b'A')],
        // A public helper should never panic on caller-supplied data, so any
        // other byte maps to the same sentinel.
        _ => "Xaa",
    }
}
299
300// ---------------------------------------------------------------------------
301// VEP-style display formatting
302// ---------------------------------------------------------------------------
303
/// Render a ref/alt codon pair using VEP's capitalisation convention.
///
/// The base at `changed_pos` is emitted exactly as supplied (uppercase for
/// the expected uppercase input); the other two bases are lowercased. The
/// two codons are joined with `/`.
///
/// # Arguments
///
/// * `ref_codon` — Reference codon (3 uppercase ASCII bases)
/// * `alt_codon` — Alternate codon (3 uppercase ASCII bases)
/// * `changed_pos` — 0-based position of the changed base (0, 1, or 2);
///   checked with a `debug_assert!` only
///
/// # Examples
///
/// ```
/// use vareffect::codon::format_codons;
/// assert_eq!(format_codons(b"CGT", b"TGT", 0), "Cgt/Tgt");
/// assert_eq!(format_codons(b"CGT", b"CAT", 1), "cGt/cAt");
/// assert_eq!(format_codons(b"CGT", b"CGA", 2), "cgT/cgA");
/// ```
pub fn format_codons(ref_codon: &[u8; 3], alt_codon: &[u8; 3], changed_pos: u8) -> String {
    // Append `codon` to `out`, lowercasing every base except the one at `keep`.
    fn push_cased(out: &mut String, codon: &[u8; 3], keep: usize) {
        for (idx, &base) in codon.iter().enumerate() {
            let ch = char::from(base);
            out.push(if idx == keep { ch } else { ch.to_ascii_lowercase() });
        }
    }

    debug_assert!(changed_pos < 3, "codon position must be 0, 1, or 2");
    let keep = usize::from(changed_pos);

    let mut out = String::with_capacity(7); // "xxx/xxx"
    push_cased(&mut out, ref_codon, keep);
    out.push('/');
    push_cased(&mut out, alt_codon, keep);
    out
}
343
/// Render a single-residue amino acid change the way VEP displays it.
///
/// - Synonymous (`ref_aa == alt_aa`): just the one letter, e.g. `"R"`
/// - Non-synonymous: `"R/W"`
/// - Stop gained: `"R/*"`
///
/// # Examples
///
/// ```
/// use vareffect::codon::format_amino_acids;
/// assert_eq!(format_amino_acids(b'R', b'W'), "R/W");
/// assert_eq!(format_amino_acids(b'R', b'R'), "R");
/// assert_eq!(format_amino_acids(b'R', b'*'), "R/*");
/// ```
pub fn format_amino_acids(ref_aa: u8, alt_aa: u8) -> String {
    let reference = char::from(ref_aa);
    if ref_aa == alt_aa {
        return reference.to_string();
    }

    let mut out = String::with_capacity(3); // "r/a"
    out.push(reference);
    out.push('/');
    out.push(char::from(alt_aa));
    out
}
365
366// ---------------------------------------------------------------------------
367// Indel helpers
368// ---------------------------------------------------------------------------
369
370/// Translate a DNA sequence to amino acids, codon by codon.
371///
372/// Uses the appropriate genetic code based on `is_mitochondrial`.
373///
374/// # Arguments
375///
376/// * `seq` — Uppercase ASCII DNA bytes. Length must be divisible by 3.
377/// * `is_mitochondrial` — `true` for chrM transcripts (NCBI table 2)
378///
379/// # Returns
380///
381/// `Vec<u8>` of one-letter amino acid codes (same encoding as
382/// [`translate_codon`]).
383///
384/// # Errors
385///
386/// Returns [`crate::VarEffectError::Malformed`] if `seq.len() % 3 != 0`.
387pub fn translate_sequence(
388 seq: &[u8],
389 is_mitochondrial: bool,
390) -> Result<Vec<u8>, crate::VarEffectError> {
391 if !seq.len().is_multiple_of(3) {
392 return Err(crate::VarEffectError::Malformed(format!(
393 "translate_sequence: sequence length {} is not divisible by 3",
394 seq.len(),
395 )));
396 }
397 let mut aas = Vec::with_capacity(seq.len() / 3);
398 for codon_bytes in seq.chunks_exact(3) {
399 let codon: &[u8; 3] = codon_bytes
400 .try_into()
401 .expect("chunks_exact(3) always yields a 3-byte slice");
402 aas.push(translate_codon_for_transcript(codon, is_mitochondrial));
403 }
404 Ok(aas)
405}
406
/// Format ref/alt codon sequences for an indel with VEP's capitalisation
/// convention.
///
/// Bases in the ref that are deleted (positions `[changed_start, changed_end)`
/// within `ref_seq`) are emitted as supplied (uppercase); all other ref bases
/// are lowercased. In the alt, the first `changed_start` bases and the last
/// `ref_seq.len() - changed_end` bases are treated as unchanged flanks and
/// lowercased; whatever lies between them (the inserted bases) is emitted as
/// supplied.
///
/// The two sequences are separated by `/`. An empty side is rendered as `-`.
///
/// # Arguments
///
/// * `ref_seq` — Codon-aligned reference CDS bases
/// * `alt_seq` — Codon-aligned alternate CDS bases (after deletion/insertion)
/// * `changed_start` — 0-based index in `ref_seq` where the change begins
/// * `changed_end` — 0-based exclusive end in `ref_seq` where the change ends
///   (for deletions: the range of deleted bases; for insertions: typically
///   `changed_start` since nothing is deleted in the ref). A value past the
///   end of `ref_seq` is treated as "up to the end of the reference" rather
///   than causing an arithmetic underflow.
///
/// # Examples
///
/// ```
/// use vareffect::codon::format_codons_indel;
/// // 3bp deletion at positions 3-5: "atgGAC/atg"
/// assert_eq!(
///     format_codons_indel(b"ATGGAC", b"ATG", 3, 6),
///     "atgGAC/atg"
/// );
/// // 3bp insertion at position 1: "agc/aGACgc"
/// assert_eq!(
///     format_codons_indel(b"AGC", b"AGACGC", 1, 1),
///     "agc/aGACgc"
/// );
/// ```
pub fn format_codons_indel(
    ref_seq: &[u8],
    alt_seq: &[u8],
    changed_start: usize,
    changed_end: usize,
) -> String {
    let mut result = String::with_capacity(ref_seq.len() + alt_seq.len() + 2);

    // Ref side: deleted bases [changed_start, changed_end) keep their case.
    if ref_seq.is_empty() {
        // Pure insertion with no ref codon context — VEP uses "-".
        result.push('-');
    } else {
        let deleted = changed_start..changed_end;
        result.extend(ref_seq.iter().enumerate().map(|(i, &b)| {
            let ch = char::from(b);
            if deleted.contains(&i) {
                ch
            } else {
                ch.to_ascii_lowercase()
            }
        }));
    }
    result.push('/');

    if alt_seq.is_empty() {
        // Complete deletion — VEP uses "-" for the empty alt side.
        result.push('-');
    } else {
        // Alt side: the unchanged prefix and suffix are lowercase; the bases
        // between them (the insertion) keep their case.
        let prefix_len = changed_start;
        // Saturate so that `changed_end > ref_seq.len()` means "no suffix"
        // instead of underflowing (a panic with overflow checks enabled, a
        // wrapped value otherwise). This helper formats caller-supplied
        // coordinates and should not panic on them.
        let suffix_len = ref_seq.len().saturating_sub(changed_end);
        let alt_suffix_start = alt_seq.len().saturating_sub(suffix_len);

        result.extend(alt_seq.iter().enumerate().map(|(i, &b)| {
            let ch = char::from(b);
            if i < prefix_len || i >= alt_suffix_start {
                ch.to_ascii_lowercase()
            } else {
                ch
            }
        }));
    }
    result
}
485
/// Render the amino acid consequence of an indel for VEP-style display.
///
/// The full ref and alt amino acid sequences (one-letter codes) are joined
/// with `/`. When the two sequences are identical (the indel is synonymous at
/// the protein level) a single copy is returned with no separator. In the
/// non-identical case an empty side is written as `-` (VEP convention).
///
/// # Examples
///
/// ```
/// use vareffect::codon::format_amino_acids_indel;
/// assert_eq!(format_amino_acids_indel(b"RR", b"R"), "RR/R");
/// assert_eq!(format_amino_acids_indel(b"R", b"RDR"), "R/RDR");
/// assert_eq!(format_amino_acids_indel(b"M", b""), "M/-");
/// assert_eq!(format_amino_acids_indel(b"", b"X"), "-/X");
/// ```
pub fn format_amino_acids_indel(ref_aas: &[u8], alt_aas: &[u8]) -> String {
    // Append one side of the change, substituting "-" for an empty sequence.
    fn push_side(out: &mut String, aas: &[u8]) {
        if aas.is_empty() {
            out.push('-');
        } else {
            out.extend(aas.iter().copied().map(char::from));
        }
    }

    if ref_aas == alt_aas {
        // Identical sequences collapse to a single copy (possibly empty).
        return ref_aas.iter().copied().map(char::from).collect();
    }

    let mut out = String::with_capacity(ref_aas.len() + alt_aas.len() + 2);
    push_side(&mut out, ref_aas);
    out.push('/');
    push_side(&mut out, alt_aas);
    out
}
525
526// ---------------------------------------------------------------------------
527// Tests
528// ---------------------------------------------------------------------------
529
#[cfg(test)]
mod tests {
    use super::*;

    /// Check all 64 standard-code codons against NCBI translation table 1.
    ///
    /// The expectation is written in NCBI's canonical layout: codons are
    /// enumerated with every position cycling through `T, C, A, G` (third
    /// base fastest) and the amino acids form one 64-character string.
    #[test]
    fn translate_all_64_codons() {
        const ORDER: [u8; 4] = *b"TCAG";
        const AAS: &[u8; 64] =
            b"FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";

        let mut checked = 0usize;
        for &first in &ORDER {
            for &second in &ORDER {
                for &third in &ORDER {
                    let codon = [first, second, third];
                    let aa = AAS[checked];
                    assert_eq!(
                        translate_codon(&codon),
                        aa,
                        "codon {} should translate to {} but got {}",
                        std::str::from_utf8(&codon).unwrap(),
                        aa as char,
                        translate_codon(&codon) as char,
                    );
                    checked += 1;
                }
            }
        }
        assert_eq!(checked, 64);
    }

    /// TAA, TAG and TGA are the stop codons of the standard code.
    #[test]
    fn translate_stop_codons() {
        for stop in [b"TAA", b"TAG", b"TGA"] {
            assert_eq!(translate_codon(stop), b'*');
        }
    }

    /// Any codon with a non-ACGT byte translates to `X` (unknown).
    #[test]
    fn translate_ambiguous_codon() {
        // The last entry is lowercase: input must be uppercase by convention,
        // so lowercase bases count as non-ACGT too.
        for ambiguous in [b"NNN", b"ANG", b"ATN", b"atg"] {
            assert_eq!(translate_codon(ambiguous), b'X');
        }
    }

    /// The 4 codons that differ between the standard and the vertebrate
    /// mitochondrial code (NCBI table 2), plus the dispatch helper.
    #[test]
    fn translate_mitochondrial_differences() {
        // (codon, standard-code result, mitochondrial-code result)
        let differing: [(&[u8; 3], u8, u8); 4] = [
            (b"TGA", b'*', b'W'), // stop -> Trp
            (b"AGA", b'R', b'*'), // Arg -> stop
            (b"AGG", b'R', b'*'), // Arg -> stop
            (b"ATA", b'I', b'M'), // Ile -> Met
        ];
        for (codon, standard, mito) in differing {
            assert_eq!(translate_codon(codon), standard);
            assert_eq!(translate_codon_mito(codon), mito);
        }

        // Verify dispatch helper
        assert_eq!(translate_codon_for_transcript(b"TGA", false), b'*');
        assert_eq!(translate_codon_for_transcript(b"TGA", true), b'W');
    }

    /// Watson-Crick complement: A↔T, C↔G, N passes through.
    #[test]
    fn complement_bases() {
        let pairs = [
            (b'A', b'T'),
            (b'T', b'A'),
            (b'C', b'G'),
            (b'G', b'C'),
            (b'N', b'N'),
        ];
        for (base, partner) in pairs {
            assert_eq!(complement(base), partner);
        }
    }

    /// Reverse complement of palindromic and asymmetric sequences, plus the
    /// in-place (non-reversing) complement.
    #[test]
    fn reverse_complement_sequence() {
        // (input, expected reverse complement)
        let cases: [(&[u8], &[u8]); 3] = [
            (b"ATCGATCGAT", b"ATCGATCGAT"),
            (b"AACCGGTT", b"AACCGGTT"),
            // Asymmetric sequence
            (b"AAACCCGGGT", b"ACCCGGGTTT"),
        ];
        for (seq, expected) in cases {
            assert_eq!(reverse_complement(seq), expected);
        }

        // complement_in_place
        let mut buf = b"ACGT".to_vec();
        complement_in_place(&mut buf);
        assert_eq!(buf, b"TGCA");
    }

    /// Three-letter codes for all 20 standard amino acids + Ter + Xaa.
    #[test]
    fn aa_three_letter_all_20() {
        // CODES[i] must expand to NAMES[i].
        const CODES: &[u8; 22] = b"ACDEFGHIKLMNPQRSTVWY*X";
        const NAMES: [&str; 22] = [
            "Ala", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", "Lys", "Leu", "Met",
            "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val", "Trp", "Tyr", "Ter", "Xaa",
        ];
        for (&code, expected) in CODES.iter().zip(NAMES) {
            assert_eq!(
                aa_three_letter(code),
                expected,
                "aa_three_letter({}) should be {}",
                code as char,
                expected,
            );
        }
    }

    /// VEP codon formatting: a change at position 0 leaves that base uppercase.
    #[test]
    fn format_codons_position_0() {
        let formatted = format_codons(b"CGT", b"TGT", 0);
        assert_eq!(formatted, "Cgt/Tgt");
    }

    /// VEP codon formatting: a change at position 1 leaves that base uppercase.
    #[test]
    fn format_codons_position_1() {
        let formatted = format_codons(b"CGT", b"CAT", 1);
        assert_eq!(formatted, "cGt/cAt");
    }

    /// VEP codon formatting: a change at position 2 leaves that base uppercase.
    #[test]
    fn format_codons_position_2() {
        let formatted = format_codons(b"CGT", b"CGA", 2);
        assert_eq!(formatted, "cgT/cgA");
    }
}