umgap/commands/
bestof.rs

1//! The `umgap bestof` command.
2
3use std::io;
4
5use crate::errors;
6use crate::io::fasta;
7use crate::taxon::TaxonId;
8
9#[derive(Debug, StructOpt)]
10#[structopt(verbatim_doc_comment)]
11/// Selects the best read of every fixed size group
12///
13/// The `umgap bestof` command takes groups of taxon IDs as input and outputs for each group the
14/// taxon ID with the most non-root identifications.
15///
16/// The input is given in FASTA format on *standard input*. Per FASTA header, there should be
17/// multiple numbers (taxon IDs). Per 6 FASTA records (or whichever number you specify with `-f`),
18/// the best record is selected and written to *standard output*. If the input is a series of
19/// identified taxon IDs for each of the 6 translations of a read, the output will most likely come
20/// from the actual coding frame.
21///
22/// ```sh
23/// $ cat dna.fa
24/// >header1
25/// CGCAGAGACGGGTAGAACCTCAGTAATCCGAAAAGCCGGGATCGACCGCCCCTTGCTTGCAGCCGGGCACTACAGGACCC
26/// $ umgap translate -n -a < dna.fa | umgap prot2kmer2lca 9mer.index | tee input.fa
27/// >header1|1
28/// 9606 9606 2759 9606 9606 9606 9606 9606 9606 9606 8287
29/// >header1|2
30/// 2026807 888268 186802 1598 1883
31/// >header1|3
32/// 1883
33/// >header1|1R
34/// 27342 2759 155619 1133106 38033 2
35/// >header1|2R
36/// >header1|3R
37/// 2951
38/// $ umgap bestof < input.fa
39/// >header1|1
40/// 9606 9606 2759 9606 9606 9606 9606 9606 9606 9606 8287
41/// ```
42///
43/// Taxon IDs are separated by newlines in the actual output, but are separated by spaces in this
44/// example.
45pub struct BestOf {
46    /// The number of frames of which to pick the best
47    #[structopt(short = "f", long = "frames", default_value = "6")]
48    pub frames: usize,
49}
50
51/// Implements the bestof command.
52pub fn bestof(args: BestOf) -> errors::Result<()> {
53    let mut writer = fasta::Writer::new(io::stdout(), "\n", false);
54
55    // Combine frames and process them
56    let mut chunk = Vec::with_capacity(args.frames);
57    for record in fasta::Reader::new(io::stdin(), false).records() {
58        let record = record?;
59        if chunk.len() < args.frames - 1 {
60            chunk.push(record);
61        } else {
62            // process chunk
63            writer.write_record_ref(
64                chunk
65                    .iter()
66                    .max_by_key(|&rec| {
67                        rec.sequence
68                            .iter()
69                            .map(|tid| tid.parse::<TaxonId>().unwrap_or(0))
70                            .filter(|&s| s != 0 && s != 1)
71                            .count()
72                    })
73                    .unwrap(),
74            )?;
75            chunk.clear();
76        }
77    }
78    Ok(())
79}