/// Maps an uppercase nucleotide byte to its 2-bit digit: `A`=0, `C`=1, `G`=2, `T`=3.
///
/// Any other byte (including lowercase letters and `N`) yields `None`.
fn base_to_index(b: u8) -> Option<usize> {
    // The position within this alphabet is the digit value.
    b"ACGT".iter().position(|&symbol| symbol == b)
}
fn codon_to_index(codon: &[u8; 3]) -> Option<usize> {
let a = base_to_index(codon[0])?;
let b = base_to_index(codon[1])?;
let c = base_to_index(codon[2])?;
Some(a * 16 + b * 4 + c)
}
/// Amino-acid lookup for the standard genetic code, one byte per codon.
///
/// Entries are ordered by `16 * first + 4 * second + third` with
/// `A`=0, `C`=1, `G`=2, `T`=3, so each row below holds the four codons that
/// share the first two bases. `*` marks a stop codon.
static STANDARD_TABLE: [u8; 64] = [
    // third base:
    // A     C     G     T
    b'K', b'N', b'K', b'N', // AA_
    b'T', b'T', b'T', b'T', // AC_
    b'R', b'S', b'R', b'S', // AG_
    b'I', b'I', b'M', b'I', // AT_
    b'Q', b'H', b'Q', b'H', // CA_
    b'P', b'P', b'P', b'P', // CC_
    b'R', b'R', b'R', b'R', // CG_
    b'L', b'L', b'L', b'L', // CT_
    b'E', b'D', b'E', b'D', // GA_
    b'A', b'A', b'A', b'A', // GC_
    b'G', b'G', b'G', b'G', // GG_
    b'V', b'V', b'V', b'V', // GT_
    b'*', b'Y', b'*', b'Y', // TA_
    b'S', b'S', b'S', b'S', // TC_
    b'*', b'C', b'W', b'C', // TG_
    b'L', b'F', b'L', b'F', // TT_
];
/// Amino-acid lookup used for mitochondrial transcripts, one byte per codon.
///
/// Same layout as `STANDARD_TABLE` (index `16 * first + 4 * second + third`,
/// `A`=0, `C`=1, `G`=2, `T`=3). It differs from the standard table in four
/// entries: `AGA` and `AGG` are stops (`*`), `ATA` is `M`, and `TGA` is `W`.
/// NOTE(review): this appears to correspond to the vertebrate mitochondrial
/// code (NCBI translation table 2) — confirm if other clades must be supported.
static MITO_TABLE: [u8; 64] = [
    // third base:
    // A     C     G     T
    b'K', b'N', b'K', b'N', // AA_
    b'T', b'T', b'T', b'T', // AC_
    b'*', b'S', b'*', b'S', // AG_  (AGA/AGG are stops here)
    b'M', b'I', b'M', b'I', // AT_  (ATA is Met here)
    b'Q', b'H', b'Q', b'H', // CA_
    b'P', b'P', b'P', b'P', // CC_
    b'R', b'R', b'R', b'R', // CG_
    b'L', b'L', b'L', b'L', // CT_
    b'E', b'D', b'E', b'D', // GA_
    b'A', b'A', b'A', b'A', // GC_
    b'G', b'G', b'G', b'G', // GG_
    b'V', b'V', b'V', b'V', // GT_
    b'*', b'Y', b'*', b'Y', // TA_
    b'S', b'S', b'S', b'S', // TC_
    b'W', b'C', b'W', b'C', // TG_  (TGA is Trp here)
    b'L', b'F', b'L', b'F', // TT_
];
/// Translates one codon with the standard table.
///
/// Returns the one-letter amino-acid byte (`*` for stop), or `X` when the
/// codon contains any byte other than uppercase `A`, `C`, `G`, `T`.
pub fn translate_codon(codon: &[u8; 3]) -> u8 {
    codon_to_index(codon).map_or(b'X', |slot| STANDARD_TABLE[slot])
}
/// Translates one codon with the mitochondrial table.
///
/// Returns the one-letter amino-acid byte (`*` for stop), or `X` when the
/// codon contains any byte other than uppercase `A`, `C`, `G`, `T`.
pub fn translate_codon_mito(codon: &[u8; 3]) -> u8 {
    codon_to_index(codon).map_or(b'X', |slot| MITO_TABLE[slot])
}
pub fn translate_codon_for_transcript(codon: &[u8; 3], is_mitochondrial: bool) -> u8 {
if is_mitochondrial {
translate_codon_mito(codon)
} else {
translate_codon(codon)
}
}
/// Returns the Watson–Crick complement of an uppercase DNA base.
///
/// `A`↔`T` and `C`↔`G` are swapped; every other byte (for example `N` or a
/// lowercase letter) is returned unchanged.
pub fn complement(base: u8) -> u8 {
    match base {
        b'A' => b'T',
        b'T' => b'A',
        b'C' => b'G',
        b'G' => b'C',
        // Unknown symbols pass through untouched.
        _ => base,
    }
}
/// Replaces every byte of `seq` with its complement (see [`complement`]).
///
/// The order of the sequence is not changed; only the bases are swapped.
pub fn complement_in_place(seq: &mut [u8]) {
    seq.iter_mut().for_each(|slot| *slot = complement(*slot));
}
pub fn reverse_complement(seq: &[u8]) -> Vec<u8> {
seq.iter().rev().map(|&b| complement(b)).collect()
}
/// Converts a one-letter amino-acid code into its three-letter abbreviation.
///
/// The stop symbol `*` maps to `"Ter"`. Any byte that is not one of the
/// twenty listed uppercase codes or `*` maps to `"Xaa"`.
pub fn aa_three_letter(one_letter: u8) -> &'static str {
    // Arms are listed alphabetically by three-letter name.
    match one_letter {
        b'A' => "Ala",
        b'R' => "Arg",
        b'N' => "Asn",
        b'D' => "Asp",
        b'C' => "Cys",
        b'Q' => "Gln",
        b'E' => "Glu",
        b'G' => "Gly",
        b'H' => "His",
        b'I' => "Ile",
        b'L' => "Leu",
        b'K' => "Lys",
        b'M' => "Met",
        b'F' => "Phe",
        b'P' => "Pro",
        b'S' => "Ser",
        b'T' => "Thr",
        b'W' => "Trp",
        b'Y' => "Tyr",
        b'V' => "Val",
        b'*' => "Ter",
        _ => "Xaa",
    }
}
/// Formats a single-base codon change as `ref/alt`, e.g. `cGt/cAt`.
///
/// In both codons the base at `changed_pos` is emitted exactly as given and
/// the other two bases are ASCII-lowercased.
///
/// `changed_pos` must be 0, 1, or 2; this is checked with a `debug_assert!`.
/// When debug assertions are disabled and the position is out of range, no
/// base is treated as changed and all bases are lowercased.
pub fn format_codons(ref_codon: &[u8; 3], alt_codon: &[u8; 3], changed_pos: u8) -> String {
    debug_assert!(changed_pos < 3, "codon position must be 0, 1, or 2");

    let highlighted = usize::from(changed_pos);
    // Renders one base: untouched at the changed position, lowercase elsewhere.
    let render = |codon: &[u8; 3], pos: usize| {
        let ch = char::from(codon[pos]);
        if pos == highlighted {
            ch
        } else {
            ch.to_ascii_lowercase()
        }
    };

    let mut out = String::with_capacity(7);
    out.extend((0..3).map(|pos| render(ref_codon, pos)));
    out.push('/');
    out.extend((0..3).map(|pos| render(alt_codon, pos)));
    out
}
/// Formats a single amino-acid change.
///
/// Returns just the one-letter code when reference and alternate are equal
/// (e.g. `"R"`), otherwise `ref/alt` (e.g. `"R/C"`).
pub fn format_amino_acids(ref_aa: u8, alt_aa: u8) -> String {
    let reference = char::from(ref_aa);
    if ref_aa != alt_aa {
        return format!("{}/{}", reference, char::from(alt_aa));
    }
    reference.to_string()
}
/// Translates a coding sequence codon by codon.
///
/// Each consecutive 3-byte chunk of `seq` is translated with
/// [`translate_codon_for_transcript`], using the mitochondrial table when
/// `is_mitochondrial` is set. An empty sequence yields an empty vector.
///
/// # Errors
///
/// Returns `VarEffectError::Malformed` when `seq.len()` is not a multiple of 3.
pub fn translate_sequence(
    seq: &[u8],
    is_mitochondrial: bool,
) -> Result<Vec<u8>, crate::VarEffectError> {
    if !seq.len().is_multiple_of(3) {
        return Err(crate::VarEffectError::Malformed(format!(
            "translate_sequence: sequence length {} is not divisible by 3",
            seq.len(),
        )));
    }

    let protein = seq
        .chunks_exact(3)
        .map(|chunk| {
            let codon: &[u8; 3] = chunk
                .try_into()
                .expect("chunks_exact(3) always yields a 3-byte slice");
            translate_codon_for_transcript(codon, is_mitochondrial)
        })
        .collect();
    Ok(protein)
}
/// Formats the codon-level change of an indel as `ref/alt`.
///
/// * Reference side: bases at indices `changed_start..changed_end` are emitted
///   as given; all other bases are ASCII-lowercased.
/// * Alternate side: the first `changed_start` bases and the last
///   `ref_seq.len() - changed_end` bases (the unchanged flanks) are
///   ASCII-lowercased; everything in between is emitted as given. If the two
///   flanks cover the whole alternate sequence, all of it is lowercased.
/// * An empty side is rendered as `-`.
///
/// A `changed_end` beyond the end of `ref_seq` is treated as "no unchanged
/// suffix" instead of underflowing.
pub fn format_codons_indel(
    ref_seq: &[u8],
    alt_seq: &[u8],
    changed_start: usize,
    changed_end: usize,
) -> String {
    let mut result = String::with_capacity(ref_seq.len() + alt_seq.len() + 2);

    if ref_seq.is_empty() {
        result.push('-');
    } else {
        for (i, &b) in ref_seq.iter().enumerate() {
            if (changed_start..changed_end).contains(&i) {
                result.push(b as char);
            } else {
                result.push((b as char).to_ascii_lowercase());
            }
        }
    }

    result.push('/');

    if alt_seq.is_empty() {
        result.push('-');
    } else {
        let prefix_len = changed_start;
        // Saturate: `changed_end` may exceed `ref_seq.len()` (e.g. an empty
        // reference with a non-zero end). A plain subtraction would panic with
        // overflow checks on, or wrap and lowercase the entire alternate side.
        let suffix_len = ref_seq.len().saturating_sub(changed_end);
        let alt_suffix_start = alt_seq.len().saturating_sub(suffix_len);
        for (i, &b) in alt_seq.iter().enumerate() {
            if i < prefix_len || i >= alt_suffix_start {
                result.push((b as char).to_ascii_lowercase());
            } else {
                result.push(b as char);
            }
        }
    }

    result
}
/// Formats the amino-acid change of an indel.
///
/// When both sides are identical the residues are returned once without a
/// separator (an empty string if both are empty). Otherwise the result is
/// `ref/alt`, with an empty side rendered as `-`.
pub fn format_amino_acids_indel(ref_aas: &[u8], alt_aas: &[u8]) -> String {
    /// Appends one side of the change, using `-` for "no residues".
    fn push_side(out: &mut String, residues: &[u8]) {
        if residues.is_empty() {
            out.push('-');
        } else {
            out.extend(residues.iter().map(|&aa| char::from(aa)));
        }
    }

    if ref_aas == alt_aas {
        return ref_aas.iter().map(|&aa| char::from(aa)).collect();
    }

    let mut out = String::with_capacity(ref_aas.len() + alt_aas.len() + 2);
    push_side(&mut out, ref_aas);
    out.push('/');
    push_side(&mut out, alt_aas);
    out
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks every codon of the standard code against its expected amino acid.
    #[test]
    fn translate_all_64_codons() {
        // Expected amino acids with T, C, A, G cycling at each codon position,
        // third base fastest: TTT, TTC, TTA, TTG, TCT, ..., GGG.
        const ORDER: [u8; 4] = *b"TCAG";
        const EXPECTED: &[u8] =
            b"FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
        assert_eq!(EXPECTED.len(), 64);

        let mut remaining = EXPECTED.iter().copied();
        for &first in &ORDER {
            for &second in &ORDER {
                for &third in &ORDER {
                    let codon = [first, second, third];
                    let aa = remaining
                        .next()
                        .expect("one expected amino acid per codon");
                    let got = translate_codon(&codon);
                    assert_eq!(
                        got,
                        aa,
                        "codon {} should translate to {} but got {}",
                        std::str::from_utf8(&codon).unwrap(),
                        aa as char,
                        got as char,
                    );
                }
            }
        }
    }

    /// The three standard stop codons translate to `*`.
    #[test]
    fn translate_stop_codons() {
        for stop in [b"TAA", b"TAG", b"TGA"] {
            assert_eq!(translate_codon(stop), b'*');
        }
    }

    /// Codons with non-ACGT or lowercase bases translate to `X`.
    #[test]
    fn translate_ambiguous_codon() {
        for codon in [b"NNN", b"ANG", b"ATN", b"atg"] {
            assert_eq!(translate_codon(codon), b'X');
        }
    }

    /// The four codons whose meaning differs between the two tables.
    #[test]
    fn translate_mitochondrial_differences() {
        // (codon, standard translation, mitochondrial translation)
        let differing: [(&[u8; 3], u8, u8); 4] = [
            (b"TGA", b'*', b'W'),
            (b"AGA", b'R', b'*'),
            (b"AGG", b'R', b'*'),
            (b"ATA", b'I', b'M'),
        ];
        for (codon, standard, mito) in differing {
            assert_eq!(translate_codon(codon), standard);
            assert_eq!(translate_codon_mito(codon), mito);
        }

        // The dispatching wrapper picks the table from the flag.
        assert_eq!(translate_codon_for_transcript(b"TGA", false), b'*');
        assert_eq!(translate_codon_for_transcript(b"TGA", true), b'W');
    }

    /// Canonical pairs are swapped; `N` is passed through.
    #[test]
    fn complement_bases() {
        let pairs = [
            (b'A', b'T'),
            (b'T', b'A'),
            (b'C', b'G'),
            (b'G', b'C'),
            (b'N', b'N'),
        ];
        for (base, partner) in pairs {
            assert_eq!(complement(base), partner);
        }
    }

    /// Reverse complement of palindromic and non-palindromic inputs, plus the
    /// in-place (non-reversing) complement.
    #[test]
    fn reverse_complement_sequence() {
        let check = |input: &[u8], want: &[u8]| {
            assert_eq!(reverse_complement(input), want);
        };
        check(b"ATCGATCGAT", b"ATCGATCGAT");
        check(b"AACCGGTT", b"AACCGGTT");
        check(b"AAACCCGGGT", b"ACCCGGGTTT");

        let mut in_place = b"ACGT".to_vec();
        complement_in_place(&mut in_place);
        assert_eq!(in_place, b"TGCA");
    }

    /// All twenty amino acids plus stop (`Ter`) and unknown (`Xaa`).
    #[test]
    fn aa_three_letter_all_20() {
        let codes = b"ACDEFGHIKLMNPQRSTVWY*X";
        let names = [
            "Ala", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", "Lys", "Leu", "Met", "Asn",
            "Pro", "Gln", "Arg", "Ser", "Thr", "Val", "Trp", "Tyr", "Ter", "Xaa",
        ];
        assert_eq!(codes.len(), names.len());

        for (&code, &expected) in codes.iter().zip(names.iter()) {
            assert_eq!(
                aa_three_letter(code),
                expected,
                "aa_three_letter({}) should be {}",
                code as char,
                expected,
            );
        }
    }

    #[test]
    fn format_codons_position_0() {
        let formatted = format_codons(b"CGT", b"TGT", 0);
        assert_eq!(formatted, "Cgt/Tgt");
    }

    #[test]
    fn format_codons_position_1() {
        let formatted = format_codons(b"CGT", b"CAT", 1);
        assert_eq!(formatted, "cGt/cAt");
    }

    #[test]
    fn format_codons_position_2() {
        let formatted = format_codons(b"CGT", b"CGA", 2);
        assert_eq!(formatted, "cgT/cgA");
    }
}