rsomics-fasta-digest 0.1.0

In-silico protein digestion — trypsin/LysC/other enzymes, missed cleavages, peptide mass filter
Documentation
use std::io::{BufWriter, Write};
use std::path::Path;

use rsomics_common::{Result, RsomicsError};

pub enum Enzyme {
    Trypsin,
    LysC,
    Chymotrypsin,
}

impl Enzyme {
    fn cleaves_after(&self, aa: u8) -> bool {
        match self {
            Self::Trypsin => aa == b'K' || aa == b'R',
            Self::LysC => aa == b'K',
            Self::Chymotrypsin => matches!(aa, b'F' | b'W' | b'Y' | b'L'),
        }
    }
}

pub fn digest(
    input: &Path,
    enzyme: &Enzyme,
    missed_cleavages: usize,
    min_len: usize,
    max_len: usize,
    output: &mut dyn Write,
) -> Result<u64> {
    let mut reader = needletail::parse_fastx_file(input)
        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", input.display())))?;

    let mut out = BufWriter::with_capacity(64 * 1024, output);
    let mut peptide_count: u64 = 0;

    while let Some(result) = reader.next() {
        let record =
            result.map_err(|e| RsomicsError::InvalidInput(format!("reading record: {e}")))?;
        let name = std::str::from_utf8(record.id())
            .map_err(|e| RsomicsError::InvalidInput(format!("name: {e}")))?;
        let seq = record.seq();
        let seq_upper: Vec<u8> = seq.iter().map(u8::to_ascii_uppercase).collect();

        let mut sites: Vec<usize> = vec![0];
        for (i, &aa) in seq_upper.iter().enumerate() {
            if enzyme.cleaves_after(aa) {
                sites.push(i + 1);
            }
        }
        sites.push(seq_upper.len());

        for i in 0..sites.len() - 1 {
            for mc in 0..=missed_cleavages {
                let end_idx = i + 1 + mc;
                if end_idx >= sites.len() {
                    break;
                }
                let start = sites[i];
                let end = sites[end_idx];
                let pep = &seq_upper[start..end];
                if pep.len() >= min_len && pep.len() <= max_len {
                    writeln!(
                        out,
                        "{name}\t{start}\t{end}\t{}",
                        std::str::from_utf8(pep).unwrap_or("?")
                    )
                    .map_err(RsomicsError::Io)?;
                    peptide_count += 1;
                }
            }
        }
    }

    out.flush().map_err(RsomicsError::Io)?;
    Ok(peptide_count)
}