Skip to main content

rsomics_fasta_translate/
lib.rs

1use std::io::{BufWriter, Write};
2use std::path::Path;
3
4use needletail::parse_fastx_file;
5use rsomics_common::{Result, RsomicsError};
6
7pub fn translate_fasta(
8    input: &Path,
9    output: &mut dyn Write,
10    frames: &[i8],
11    table: u8,
12) -> Result<u64> {
13    if std::fs::metadata(input).is_ok_and(|m| m.len() == 0) {
14        return Err(RsomicsError::InvalidInput("empty file".into()));
15    }
16
17    let mut reader = parse_fastx_file(input)
18        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", input.display())))?;
19
20    let mut out = BufWriter::with_capacity(256 * 1024, output);
21    let mut count: u64 = 0;
22
23    while let Some(record) = reader.next() {
24        let record = record.map_err(|e| RsomicsError::InvalidInput(format!("reading: {e}")))?;
25        let id = std::str::from_utf8(record.id()).unwrap_or("unknown");
26        let seq = record.seq();
27
28        for &frame in frames {
29            let (dna, frame_label) = if frame > 0 {
30                (seq.to_vec(), format!("+{frame}"))
31            } else {
32                (revcomp(&seq), format!("{frame}"))
33            };
34
35            let offset = (frame.unsigned_abs() as usize).saturating_sub(1);
36            if offset >= dna.len() {
37                continue;
38            }
39
40            let protein = translate_seq(&dna[offset..], table);
41            writeln!(out, ">{id}_frame{frame_label}").map_err(RsomicsError::Io)?;
42            out.write_all(protein.as_bytes())
43                .map_err(RsomicsError::Io)?;
44            out.write_all(b"\n").map_err(RsomicsError::Io)?;
45            count += 1;
46        }
47    }
48
49    out.flush().map_err(RsomicsError::Io)?;
50    Ok(count)
51}
52
53fn translate_seq(dna: &[u8], _table: u8) -> String {
54    let mut protein = String::with_capacity(dna.len() / 3 + 1);
55    for codon in dna.chunks(3) {
56        if codon.len() < 3 {
57            break;
58        }
59        protein.push(translate_codon(codon));
60    }
61    protein
62}
63
64fn translate_codon(codon: &[u8]) -> char {
65    let c = [
66        codon[0].to_ascii_uppercase(),
67        codon[1].to_ascii_uppercase(),
68        codon[2].to_ascii_uppercase(),
69    ];
70    match &c {
71        b"TTT" | b"TTC" => 'F',
72        b"TTA" | b"TTG" | b"CTT" | b"CTC" | b"CTA" | b"CTG" => 'L',
73        b"ATT" | b"ATC" | b"ATA" => 'I',
74        b"ATG" => 'M',
75        b"GTT" | b"GTC" | b"GTA" | b"GTG" => 'V',
76        b"TCT" | b"TCC" | b"TCA" | b"TCG" | b"AGT" | b"AGC" => 'S',
77        b"CCT" | b"CCC" | b"CCA" | b"CCG" => 'P',
78        b"ACT" | b"ACC" | b"ACA" | b"ACG" => 'T',
79        b"GCT" | b"GCC" | b"GCA" | b"GCG" => 'A',
80        b"TAT" | b"TAC" => 'Y',
81        b"TAA" | b"TAG" | b"TGA" => '*',
82        b"CAT" | b"CAC" => 'H',
83        b"CAA" | b"CAG" => 'Q',
84        b"AAT" | b"AAC" => 'N',
85        b"AAA" | b"AAG" => 'K',
86        b"GAT" | b"GAC" => 'D',
87        b"GAA" | b"GAG" => 'E',
88        b"TGT" | b"TGC" => 'C',
89        b"TGG" => 'W',
90        b"CGT" | b"CGC" | b"CGA" | b"CGG" | b"AGA" | b"AGG" => 'R',
91        b"GGT" | b"GGC" | b"GGA" | b"GGG" => 'G',
92        // Degenerate codons where all IUPAC expansions encode the same amino acid.
93        // Standard IUPAC N = {A,C,G,T}; the 8 cases below are the only 4-fold
94        // degenerate patterns for the standard genetic code (table 1).
95        [b1, b2, b'N'] => {
96            match (b1, b2) {
97                (b'G', b'T') => 'V', // GTN = {GTA,GTC,GTG,GTT} all Val
98                (b'G', b'C') => 'A', // GCN = {GCA,GCC,GCG,GCT} all Ala
99                (b'G', b'G') => 'G', // GGN = {GGA,GGC,GGG,GGT} all Gly
100                (b'C', b'T') => 'L', // CTN = {CTA,CTC,CTG,CTT} all Leu
101                (b'C', b'C') => 'P', // CCN = {CCA,CCC,CCG,CCT} all Pro
102                (b'C', b'G') => 'R', // CGN = {CGA,CGC,CGG,CGT} all Arg
103                (b'A', b'C') => 'T', // ACN = {ACA,ACC,ACG,ACT} all Thr
104                (b'T', b'C') => 'S', // TCN = {TCA,TCC,TCG,TCT} all Ser
105                _ => 'X',
106            }
107        }
108        _ => 'X',
109    }
110}
111
112fn revcomp(seq: &[u8]) -> Vec<u8> {
113    seq.iter()
114        .rev()
115        .map(|&b| match b {
116            b'A' | b'a' => b'T',
117            b'T' | b't' => b'A',
118            b'C' | b'c' => b'G',
119            b'G' | b'g' => b'C',
120            _ => b'N',
121        })
122        .collect()
123}