umgap 1.0.0 - Docs.rs

//! The `umgap prot2tryp2lca` command.

use std::collections::HashSet;
use std::fs;
use std::io;
use std::path::PathBuf;

use fst;

use regex;

use rayon::iter::{ParallelBridge, ParallelIterator};

use crate::errors;
use crate::io::fasta;

#[derive(Debug, StructOpt)]
#[structopt(verbatim_doc_comment)]
/// Digests a FASTA stream of peptides and maps all tryptic peptides to taxon IDs
///
/// The `umgap prot2tryp2lca` command takes one or more peptides and splits these into
/// tryptic peptides, possibly filters them, and outputs their lowest common ancestors. It is a
/// combination of the `umgap prot2tryp`, `umgap filter` and `umgap pept2lca` commands to allow more
/// efficient parallel computing (c.f. their documentation for details).
///
/// The input is given in a FASTA format on *standard input* with a single peptide per FASTA header,
/// which may be hardwrapped with newlines. The command prints the lowest common ancestors for each
/// tryptic peptide found in each given peptide to standard output.
///
/// ```sh
/// $ cat input.fa
/// >header1
/// AYKKAGVSGHVWQSDGITNCLLRGLTRVKEAVANRDSGNGYINKVYYWTVDKRATTRDALDAGVDGIMTNYPDVITDVLN
/// $ umgap prot2tryp2lca tryptic-lca.index < input.fa
/// >header1
/// 571525
/// 1
/// 571525
/// 6920
/// ```
pub struct ProtToTrypToLca {
    /// Map unknown sequences to 0 instead of ignoring them
    #[structopt(short = "o", long = "one-on-one")]
    pub one_on_one: bool,

    /// An index that maps tryptic peptides to taxon IDs
    #[structopt(parse(from_os_str))]
    pub fst_file: PathBuf,

    /// Load index in memory instead of memory mapping the file contents. This
    /// makes querying significantly faster, but requires some initialization
    /// time.
    #[structopt(short = "m", long = "in-memory")]
    pub fst_in_memory: bool,

    /// Number of reads grouped into one chunk. Bigger chunks decrease
    /// the overhead caused by multithreading. Because the output order is not
    /// necessarily the same as the input order, having a chunk size which is
    /// a multiple of 12 (all 6 translations multiplied by the two paired-end
    /// reads) will keep FASTA records that originate from the same reads
    /// together.
    #[structopt(short = "c", long = "chunksize", default_value = "240")]
    pub chunk_size: usize,

    /// The cleavage-pattern (regex), i.e. the pattern after which
    /// the next peptide will be cleaved for tryptic peptides)
    #[structopt(short = "p", long = "pattern", default_value = "([KR])([^P])")]
    pub pattern: String,

    /// Minimum length of tryptic peptides to be mapped
    #[structopt(short = "l", long = "minlen", default_value = "5")]
    pub min_length: usize,

    /// Maximum length of tryptic peptides to be mapped
    #[structopt(short = "L", long = "maxlen", default_value = "50")]
    pub max_length: usize,

    /// Amino acid symbols that a peptide must contain to be processed
    #[structopt(short = "k", long = "keep", default_value = "")]
    pub contains: String,

    /// Amino acid symbols that a peptide may not contain to be processed
    #[structopt(short = "d", long = "drop", default_value = "")]
    pub lacks: String,
}

/// Implements the prot2tryp2lca command.
pub fn prot2tryp2lca(args: ProtToTrypToLca) -> errors::Result<()> {
    let fst = if args.fst_in_memory {
        let bytes = fs::read(&args.fst_file)?;
        fst::Map::from_bytes(bytes)?
    } else {
        unsafe { fst::Map::from_path(&args.fst_file) }?
    };
    let default = if args.one_on_one { Some(0) } else { None };
    let pattern = regex::Regex::new(&args.pattern)?;
    let contains = args.contains.chars().collect::<HashSet<char>>();
    let lacks = args.lacks.chars().collect::<HashSet<char>>();

    fasta::Reader::new(io::stdin(), false)
        .records()
        .chunked(args.chunk_size)
        .par_bridge()
        .map(|chunk| {
            let chunk = chunk?;
            let mut chunk_output = String::new();
            for read in chunk {
                chunk_output.push_str(&format!(">{}\n", read.header));
                for seq in read.sequence {
                    // We will run the regex replacement twice, since a letter can be
                    // matched twice (i.e. once after and once before the split).
                    let first_run = pattern.replace_all(&seq, "$1\n$2");
                    for peptide in pattern
                        .replace_all(&first_run, "$1\n$2")
                        .replace("*", "\n")
                        .lines()
                        .filter(|x| !x.is_empty())
                        .filter(|seq| {
                            let length = seq.len();
                            length >= args.min_length && length <= args.max_length
                        })
                        .filter(|seq| {
                            (contains.is_empty() && lacks.is_empty()) || {
                                let set = seq.chars().collect::<HashSet<char>>();
                                contains.intersection(&set).count() == contains.len()
                                    && lacks.intersection(&set).count() == 0
                            }
                        })
                    {
                        if let Some(lca) = fst.get(&peptide).map(Some).unwrap_or(default) {
                            chunk_output.push_str(&format!("{}\n", lca));
                        }
                    }
                }
            }
            print!("{}", chunk_output);
            Ok(())
        })
        .collect()
}