Skip to main content

rsomics_fasta_digest/
lib.rs

1use std::io::{BufWriter, Write};
2use std::path::Path;
3
4use rsomics_common::{Result, RsomicsError};
5
6pub enum Enzyme {
7    Trypsin,
8    LysC,
9    Chymotrypsin,
10}
11
12impl Enzyme {
13    fn cleaves_after(&self, aa: u8) -> bool {
14        match self {
15            Self::Trypsin => aa == b'K' || aa == b'R',
16            Self::LysC => aa == b'K',
17            Self::Chymotrypsin => matches!(aa, b'F' | b'W' | b'Y' | b'L'),
18        }
19    }
20}
21
22pub fn digest(
23    input: &Path,
24    enzyme: &Enzyme,
25    missed_cleavages: usize,
26    min_len: usize,
27    max_len: usize,
28    output: &mut dyn Write,
29) -> Result<u64> {
30    let mut reader = needletail::parse_fastx_file(input)
31        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", input.display())))?;
32
33    let mut out = BufWriter::with_capacity(64 * 1024, output);
34    let mut peptide_count: u64 = 0;
35
36    while let Some(result) = reader.next() {
37        let record =
38            result.map_err(|e| RsomicsError::InvalidInput(format!("reading record: {e}")))?;
39        let name = std::str::from_utf8(record.id())
40            .map_err(|e| RsomicsError::InvalidInput(format!("name: {e}")))?;
41        let seq = record.seq();
42        let seq_upper: Vec<u8> = seq.iter().map(u8::to_ascii_uppercase).collect();
43
44        let mut sites: Vec<usize> = vec![0];
45        for (i, &aa) in seq_upper.iter().enumerate() {
46            if enzyme.cleaves_after(aa) {
47                sites.push(i + 1);
48            }
49        }
50        sites.push(seq_upper.len());
51
52        for i in 0..sites.len() - 1 {
53            for mc in 0..=missed_cleavages {
54                let end_idx = i + 1 + mc;
55                if end_idx >= sites.len() {
56                    break;
57                }
58                let start = sites[i];
59                let end = sites[end_idx];
60                let pep = &seq_upper[start..end];
61                if pep.len() >= min_len && pep.len() <= max_len {
62                    writeln!(
63                        out,
64                        "{name}\t{start}\t{end}\t{}",
65                        std::str::from_utf8(pep).unwrap_or("?")
66                    )
67                    .map_err(RsomicsError::Io)?;
68                    peptide_count += 1;
69                }
70            }
71        }
72    }
73
74    out.flush().map_err(RsomicsError::Io)?;
75    Ok(peptide_count)
76}