ngs/generate/
command.rs

1//! Functionality related to the `ngs generate` subcommand itself.
2
3use std::path::PathBuf;
4
5use anyhow::Context;
6use clap::ArgGroup;
7use clap::Args;
8use indicatif::ProgressBar;
9use indicatif::ProgressStyle;
10use rand::prelude::*;
11use tracing::info;
12
13use crate::generate::providers::reference_provider::ReferenceGenomeSequenceProvider;
14use crate::generate::providers::SequenceProvider;
15use crate::utils::formats;
16
17/// Utility method to parse the error rate passed in on the command line and
18/// ensure the rate is within the range [0.0, 1.0].
19pub fn error_rate_in_range(error_rate_raw: &str) -> Result<f32, String> {
20    let error_rate: f32 = error_rate_raw
21        .parse()
22        .map_err(|_| format!("{} isn't a float", error_rate_raw))?;
23
24    match (0.0..=1.0).contains(&error_rate) {
25        true => Ok(error_rate),
26        false => Err(String::from("Error rate must be between 0.0 and 1.0")),
27    }
28}
29
30/// Command line arguments for `ngs generate`.
31#[derive(Args)]
32#[command(group(ArgGroup::new("record-count").required(true).args(["coverage", "num_records"])))]
33pub struct GenerateArgs {
34    /// Destination for the FASTQ file containing all read ones.
35    read_ones_file: PathBuf,
36
37    /// Destination for the FASTQ file containing all read twos.
38    read_twos_file: PathBuf,
39
40    /// One or more reference FASTAs to generate the data based off of.
41    #[arg(required = true)] // required implies one or more
42    reference_providers: Vec<String>,
43
44    /// The error rate for the sequencer as a fraction between [0.0, 1.0] (per base).
45    #[arg(short, long, value_name = "F32", default_value = "0.0001")]
46    #[arg(value_parser = error_rate_in_range)]
47    error_rate: Option<f32>,
48
49    /// Specifies the number of records to generate.
50    #[arg(short, long, value_name = "USIZE", conflicts_with = "coverage")]
51    num_records: Option<usize>,
52
53    /// Dynamically calculate the number of reads needed for a particular mean coverage.
54    #[arg(short, long, value_name = "USIZE", conflicts_with = "num_records")]
55    coverage: Option<usize>,
56}
57
58/// Main function for the `ngs generate` subcommand.
59pub fn generate(args: GenerateArgs) -> anyhow::Result<()> {
60    // (0) Parse arguments needed for subcommand.
61    let result: anyhow::Result<Vec<_>> = args
62        .reference_providers
63        .iter()
64        .map(|provider_as_string| provider_as_string.parse::<ReferenceGenomeSequenceProvider>())
65        .collect();
66    let reference_providers = result.with_context(|| "issue parsing reference providers")?;
67
68    let reads_one_file = args.read_ones_file;
69    let reads_two_file = args.read_twos_file;
70
71    info!("Starting generate command...");
72    let mut writer_read_one = formats::fastq::writer(&reads_one_file).with_context(|| {
73        format!(
74            "Could not open reads one file: {}.",
75            reads_one_file.display()
76        )
77    })?;
78    let mut writer_read_two = formats::fastq::writer(&reads_two_file).with_context(|| {
79        format!(
80            "Could not open reads two file: {}.",
81            reads_two_file.display()
82        )
83    })?;
84
85    let mut total_reads: usize = 0;
86
87    if let Some(num_records) = args.num_records {
88        total_reads = num_records
89    } else if let Some(coverage) = args.coverage {
90        total_reads = reference_providers[0].reads_needed_for_coverage(coverage);
91    }
92
93    // (2) Set up the output writers.
94    info!("Generating {} reads...", total_reads);
95
96    // (3) Set up the progress bar.
97    let pb = ProgressBar::new(total_reads as u64);
98    pb.set_style(
99        ProgressStyle::default_bar()
100            .template("{prefix:.cyan.bold} {spinner:.green} [{elapsed_precise}] [{bar}] {pos}/{len} ({per_sec}, {eta})")
101            .progress_chars("=> "),
102    );
103    pb.set_prefix("Generating");
104
105    // (4) Generate the reads and write them to their respective files.
106    let mut rng = ThreadRng::default();
107
108    let mut i = 0;
109    while i < total_reads {
110        let selected_genome = reference_providers
111            .choose_weighted(&mut rng, |x| x.weight)
112            .unwrap();
113        let read_pair = selected_genome.generate_read_pair(
114            format!("ngs:{}", selected_genome.filename),
115            (i + 1).to_string(),
116        );
117        writer_read_one
118            .write_record(read_pair.get_forward_read())
119            .with_context(|| "Could not write record to read one file.")?;
120        writer_read_two
121            .write_record(read_pair.get_reverse_read())
122            .with_context(|| "Could not write record to read two file.")?;
123
124        if i > 0 && i % 5_000 == 0 {
125            pb.inc(5000);
126        }
127
128        i += 1;
129    }
130
131    pb.set_style(
132        ProgressStyle::default_bar()
133            .template("{prefix:.green.bold} {msg:.white.bold} [{elapsed_precise}] [{bar}] {pos}/{len} ({per_sec}, {eta})")
134            .progress_chars("=> "),
135    );
136    pb.set_prefix("✓");
137    pb.finish_with_message("Finished");
138    Ok(())
139}