rsomics_fasta_digest/
lib.rs1use std::io::{BufWriter, Write};
2use std::path::Path;
3
4use rsomics_common::{Result, RsomicsError};
5
6pub enum Enzyme {
7 Trypsin,
8 LysC,
9 Chymotrypsin,
10}
11
12impl Enzyme {
13 fn cleaves_after(&self, aa: u8) -> bool {
14 match self {
15 Self::Trypsin => aa == b'K' || aa == b'R',
16 Self::LysC => aa == b'K',
17 Self::Chymotrypsin => matches!(aa, b'F' | b'W' | b'Y' | b'L'),
18 }
19 }
20}
21
22pub fn digest(
23 input: &Path,
24 enzyme: &Enzyme,
25 missed_cleavages: usize,
26 min_len: usize,
27 max_len: usize,
28 output: &mut dyn Write,
29) -> Result<u64> {
30 let mut reader = needletail::parse_fastx_file(input)
31 .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", input.display())))?;
32
33 let mut out = BufWriter::with_capacity(64 * 1024, output);
34 let mut peptide_count: u64 = 0;
35
36 while let Some(result) = reader.next() {
37 let record =
38 result.map_err(|e| RsomicsError::InvalidInput(format!("reading record: {e}")))?;
39 let name = std::str::from_utf8(record.id())
40 .map_err(|e| RsomicsError::InvalidInput(format!("name: {e}")))?;
41 let seq = record.seq();
42 let seq_upper: Vec<u8> = seq.iter().map(u8::to_ascii_uppercase).collect();
43
44 let mut sites: Vec<usize> = vec![0];
45 for (i, &aa) in seq_upper.iter().enumerate() {
46 if enzyme.cleaves_after(aa) {
47 sites.push(i + 1);
48 }
49 }
50 sites.push(seq_upper.len());
51
52 for i in 0..sites.len() - 1 {
53 for mc in 0..=missed_cleavages {
54 let end_idx = i + 1 + mc;
55 if end_idx >= sites.len() {
56 break;
57 }
58 let start = sites[i];
59 let end = sites[end_idx];
60 let pep = &seq_upper[start..end];
61 if pep.len() >= min_len && pep.len() <= max_len {
62 writeln!(
63 out,
64 "{name}\t{start}\t{end}\t{}",
65 std::str::from_utf8(pep).unwrap_or("?")
66 )
67 .map_err(RsomicsError::Io)?;
68 peptide_count += 1;
69 }
70 }
71 }
72 }
73
74 out.flush().map_err(RsomicsError::Io)?;
75 Ok(peptide_count)
76}