rsomics_fasta_translate/
lib.rs1use std::io::{BufWriter, Write};
2use std::path::Path;
3
4use needletail::parse_fastx_file;
5use rsomics_common::{Result, RsomicsError};
6
7pub fn translate_fasta(
8 input: &Path,
9 output: &mut dyn Write,
10 frames: &[i8],
11 table: u8,
12) -> Result<u64> {
13 if std::fs::metadata(input).is_ok_and(|m| m.len() == 0) {
14 return Err(RsomicsError::InvalidInput("empty file".into()));
15 }
16
17 let mut reader = parse_fastx_file(input)
18 .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", input.display())))?;
19
20 let mut out = BufWriter::with_capacity(256 * 1024, output);
21 let mut count: u64 = 0;
22
23 while let Some(record) = reader.next() {
24 let record = record.map_err(|e| RsomicsError::InvalidInput(format!("reading: {e}")))?;
25 let id = std::str::from_utf8(record.id()).unwrap_or("unknown");
26 let seq = record.seq();
27
28 for &frame in frames {
29 let (dna, frame_label) = if frame > 0 {
30 (seq.to_vec(), format!("+{frame}"))
31 } else {
32 (revcomp(&seq), format!("{frame}"))
33 };
34
35 let offset = (frame.unsigned_abs() as usize).saturating_sub(1);
36 if offset >= dna.len() {
37 continue;
38 }
39
40 let protein = translate_seq(&dna[offset..], table);
41 writeln!(out, ">{id}_frame{frame_label}").map_err(RsomicsError::Io)?;
42 out.write_all(protein.as_bytes())
43 .map_err(RsomicsError::Io)?;
44 out.write_all(b"\n").map_err(RsomicsError::Io)?;
45 count += 1;
46 }
47 }
48
49 out.flush().map_err(RsomicsError::Io)?;
50 Ok(count)
51}
52
53fn translate_seq(dna: &[u8], _table: u8) -> String {
54 let mut protein = String::with_capacity(dna.len() / 3 + 1);
55 for codon in dna.chunks(3) {
56 if codon.len() < 3 {
57 break;
58 }
59 protein.push(translate_codon(codon));
60 }
61 protein
62}
63
64fn translate_codon(codon: &[u8]) -> char {
65 let c = [
66 codon[0].to_ascii_uppercase(),
67 codon[1].to_ascii_uppercase(),
68 codon[2].to_ascii_uppercase(),
69 ];
70 match &c {
71 b"TTT" | b"TTC" => 'F',
72 b"TTA" | b"TTG" | b"CTT" | b"CTC" | b"CTA" | b"CTG" => 'L',
73 b"ATT" | b"ATC" | b"ATA" => 'I',
74 b"ATG" => 'M',
75 b"GTT" | b"GTC" | b"GTA" | b"GTG" => 'V',
76 b"TCT" | b"TCC" | b"TCA" | b"TCG" | b"AGT" | b"AGC" => 'S',
77 b"CCT" | b"CCC" | b"CCA" | b"CCG" => 'P',
78 b"ACT" | b"ACC" | b"ACA" | b"ACG" => 'T',
79 b"GCT" | b"GCC" | b"GCA" | b"GCG" => 'A',
80 b"TAT" | b"TAC" => 'Y',
81 b"TAA" | b"TAG" | b"TGA" => '*',
82 b"CAT" | b"CAC" => 'H',
83 b"CAA" | b"CAG" => 'Q',
84 b"AAT" | b"AAC" => 'N',
85 b"AAA" | b"AAG" => 'K',
86 b"GAT" | b"GAC" => 'D',
87 b"GAA" | b"GAG" => 'E',
88 b"TGT" | b"TGC" => 'C',
89 b"TGG" => 'W',
90 b"CGT" | b"CGC" | b"CGA" | b"CGG" | b"AGA" | b"AGG" => 'R',
91 b"GGT" | b"GGC" | b"GGA" | b"GGG" => 'G',
92 [b1, b2, b'N'] => {
96 match (b1, b2) {
97 (b'G', b'T') => 'V', (b'G', b'C') => 'A', (b'G', b'G') => 'G', (b'C', b'T') => 'L', (b'C', b'C') => 'P', (b'C', b'G') => 'R', (b'A', b'C') => 'T', (b'T', b'C') => 'S', _ => 'X',
106 }
107 }
108 _ => 'X',
109 }
110}
111
112fn revcomp(seq: &[u8]) -> Vec<u8> {
113 seq.iter()
114 .rev()
115 .map(|&b| match b {
116 b'A' | b'a' => b'T',
117 b'T' | b't' => b'A',
118 b'C' | b'c' => b'G',
119 b'G' | b'g' => b'C',
120 _ => b'N',
121 })
122 .collect()
123}