use crate::cli::{
check_path_exists, parse_compression_format, parse_fraction, parse_level, CliError, Coverage,
GenomeSize,
};
use crate::fastx::create_output_writer;
use crate::source::determine_record_source;
use crate::{Runner, SubSampler};
use anyhow::{Context, Result};
use clap::Parser;
use log::{debug, error, info, warn};
use niffler::compression;
use std::io::stdout;
use std::path::{Path, PathBuf};
#[derive(Debug, Parser)]
#[command(author, version, about)]
pub struct Reads {
#[arg(
value_parser = check_path_exists,
num_args = 1..=2,
required = true,
name = "FILE(S)"
)]
pub input: Vec<PathBuf>,
#[arg(short = 'o', long = "output", action = clap::ArgAction::Append)]
pub output: Vec<PathBuf>,
#[clap(
short,
long,
required_unless_present_any = &["bases", "num", "frac"],
requires = "coverage",
value_name = "size|faidx",
conflicts_with_all = &["num", "frac"]
)]
pub genome_size: Option<GenomeSize>,
#[clap(
short,
long,
value_name = "FLOAT",
required_unless_present_any = &["bases", "num", "frac"],
requires = "genome_size",
conflicts_with_all = &["num", "frac"]
)]
pub coverage: Option<Coverage>,
#[clap(short, long, value_name = "bases", conflicts_with_all = &["num", "frac"])]
pub bases: Option<GenomeSize>,
#[clap(short, long, value_name = "INT", conflicts_with = "frac")]
pub num: Option<GenomeSize>,
#[clap(short, long, value_name = "FLOAT", value_parser = parse_fraction, conflicts_with = "num")]
pub frac: Option<f32>,
#[clap(short = 'e', long)]
pub strict: bool,
#[clap(short = 's', long = "seed", value_name = "INT")]
pub seed: Option<u64>,
#[clap(short)]
pub verbose: bool,
#[clap(short = 'Z', long = "compress-type", value_name = "u|b|g|l|x|z", value_parser = parse_compression_format)]
pub compress_type: Option<niffler::compression::Format>,
#[clap(short = 'O', long = "output-format", value_enum)]
pub output_format: Option<crate::cli::OutputFormat>,
#[clap(short = 'l', long, value_parser = parse_level, value_name = "1-21")]
pub compress_level: Option<niffler::Level>,
}
impl Reads {
pub fn validate_input_output_combination(&self) -> std::result::Result<(), CliError> {
let out_len = self.output.len();
let in_len = self.input.len();
if in_len > 2 {
let msg = String::from("Got more than 2 files for input.");
return Err(CliError::BadInputOutputCombination(msg));
}
if out_len > 2 {
let msg = String::from("Got more than 2 files for output.");
return Err(CliError::BadInputOutputCombination(msg));
}
match in_len as isize - out_len as isize {
diff if diff == 1 && in_len == 1 => Ok(()),
diff if diff != 0 => Err(CliError::BadInputOutputCombination(format!(
"Got {in_len} --input but {out_len} --output"
))),
_ => Ok(()),
}
}
}
impl Runner for Reads {
fn run(&mut self) -> Result<()> {
self.validate_input_output_combination()?;
let is_paired = self.input.len() == 2;
if is_paired {
info!("Two input files given. Assuming paired Illumina...")
}
let input_source = determine_record_source(&self.input[0]);
let input_format = crate::alignment::infer_format_from_path(&self.input[0]);
let check_conversion = |in_fmt: Option<noodles_util::alignment::io::Format>,
out_path: Option<&std::path::PathBuf>|
-> Result<()> {
if in_fmt.is_none() {
let has_alignment_output = match &self.output_format {
Some(crate::cli::OutputFormat::Sam)
| Some(crate::cli::OutputFormat::Bam)
| Some(crate::cli::OutputFormat::Cram) => true,
_ => out_path
.map(|p| crate::alignment::infer_format_from_path(p).is_some())
.unwrap_or(false),
};
if has_alignment_output {
let out_fmt = match &self.output_format {
Some(crate::cli::OutputFormat::Sam) => {
noodles_util::alignment::io::Format::Sam
}
Some(crate::cli::OutputFormat::Bam) => {
noodles_util::alignment::io::Format::Bam
}
Some(crate::cli::OutputFormat::Cram) => {
noodles_util::alignment::io::Format::Cram
}
_ => crate::alignment::infer_format_from_path(out_path.unwrap()).unwrap(),
};
return Err(anyhow::anyhow!(
"Conversion from FASTA/FASTQ to {:?} is not supported. Please use FASTA/FASTQ output for FASTA/FASTQ input.",
out_fmt
));
}
}
Ok(())
};
check_conversion(input_format, self.output.first())?;
if is_paired {
let second_input_format = crate::alignment::infer_format_from_path(&self.input[1]);
check_conversion(second_input_format, self.output.get(1))?;
}
let mut output_handle = match self.output.len() {
0 => match self.compress_type {
None => Box::new(stdout()) as Box<dyn std::io::Write>,
Some(fmt) => {
let lvl = match fmt {
compression::Format::Gzip => compression::Level::Six,
compression::Format::Bzip => compression::Level::Nine,
compression::Format::Lzma => compression::Level::Six,
compression::Format::Zstd => compression::Level::Three,
_ => compression::Level::Zero,
};
niffler::basic::get_writer(Box::new(stdout()), fmt, lvl)?
}
},
_ => create_output_writer(&self.output[0], self.compress_level, self.compress_type)
.context("unable to create the first output file")?,
};
let target_total_bases: Option<u64> = match (self.genome_size, self.coverage, self.bases) {
(_, _, Some(bases)) => Some(u64::from(bases)),
(Some(gsize), Some(cov), _) => Some(gsize * cov),
_ => None,
};
if let Some(bases) = target_total_bases {
info!("Target number of bases to subsample to is: {bases}",);
}
info!("Gathering read lengths...");
let mut read_lengths = input_source
.read_lengths()
.context("unable to gather read lengths for the first input file")?;
if is_paired {
let second_input_source = determine_record_source(&self.input[1]);
let expected_num_reads = read_lengths.len();
info!("Gathering read lengths for second input file...");
let mate_lengths = second_input_source
.read_lengths()
.context("unable to gather read lengths for the second input file")?;
if mate_lengths.len() != expected_num_reads {
error!("First input has {} reads, but the second has {} reads. Paired Illumina files are assumed to have the same number of reads. The results of this subsample may not be as expected now.", expected_num_reads, read_lengths.len());
std::process::exit(1);
} else {
info!(
"Both input files have the same number of reads ({}) 👍",
expected_num_reads
);
}
for (i, len) in mate_lengths.iter().enumerate() {
read_lengths[i] += len;
}
}
info!("{} reads detected", read_lengths.len());
if let Some(size) = self.genome_size {
let number_of_bases: u64 = read_lengths.iter().map(|&x| x as u64).sum();
let depth_of_covg = (number_of_bases as f64) / f64::from(size);
info!("Input coverage is {:.2}x", depth_of_covg);
if self.strict
&& target_total_bases.is_some()
&& Coverage(depth_of_covg as f32) < self.coverage.unwrap()
{
return Err(anyhow::anyhow!(
"Requested coverage ({:.2}x) is not possible as the actual coverage is {:.2}x",
self.coverage.unwrap().0,
depth_of_covg
));
}
}
if let Some(req_bases) = self.bases {
let total_bases: u64 = read_lengths.iter().map(|&x| x as u64).sum();
let req_bases = u64::from(req_bases);
if self.strict && req_bases > total_bases {
return Err(anyhow::anyhow!(
"Requested number of bases ({}) is more than the input ({})",
req_bases,
total_bases
));
}
}
let num_reads = match (self.num, self.frac) {
(Some(n), None) => Some(u64::from(n)),
(None, Some(f)) => {
let n = ((f as f64) * (read_lengths.len() as f64)).round() as u64;
if n == 0 {
if !self.strict {
warn!(
"Requested fraction of reads ({} * {}) was rounded to 0",
f,
read_lengths.len()
);
} else {
return Err(anyhow::anyhow!(
"Requested fraction of reads ({} * {}) was rounded to 0",
f,
read_lengths.len()
));
}
}
Some(n)
}
_ => None,
};
if let Some(n) = num_reads {
if self.strict && n as usize > read_lengths.len() {
return Err(anyhow::anyhow!(
"Requested number of reads ({}) is more than the input ({})",
n,
read_lengths.len()
));
}
info!("Target number of reads to subsample to is: {}", n);
}
let subsampler = SubSampler {
target_total_bases,
seed: self.seed,
num_reads,
};
let (reads_to_keep, nb_reads_to_keep) = subsampler.indices(&read_lengths);
if is_paired {
info!("Keeping {} reads from each input", nb_reads_to_keep);
} else {
info!("Keeping {} reads", nb_reads_to_keep);
}
debug!("Indices of reads being kept:\n{:?}", reads_to_keep);
let output_format_1 = match &self.output_format {
Some(crate::cli::OutputFormat::Sam) => Some(noodles_util::alignment::io::Format::Sam),
Some(crate::cli::OutputFormat::Bam) => Some(noodles_util::alignment::io::Format::Bam),
Some(crate::cli::OutputFormat::Cram) => Some(noodles_util::alignment::io::Format::Cram),
Some(crate::cli::OutputFormat::Fasta) | Some(crate::cli::OutputFormat::Fastq) => None,
None => {
if self.output.is_empty() {
input_format
} else {
crate::alignment::infer_format_from_path(&self.output[0])
}
}
};
let is_fasta_1 = match self.output_format {
Some(crate::cli::OutputFormat::Fasta) => true,
Some(crate::cli::OutputFormat::Fastq) => false,
_ => {
if self.output.is_empty() {
is_fasta_path(&self.input[0])
} else {
is_fasta_path(&self.output[0])
}
}
};
let mut total_kept_bases = input_source.filter_reads_into(
&reads_to_keep,
nb_reads_to_keep,
&mut output_handle,
output_format_1,
is_fasta_1,
)? as u64;
if is_paired {
let second_input_source = determine_record_source(&self.input[1]);
let second_input_format = crate::alignment::infer_format_from_path(&self.input[1]);
let mut second_output_handle =
create_output_writer(&self.output[1], self.compress_level, self.compress_type)
.context("unable to create the second output file")?;
let output_format_2 = match &self.output_format {
Some(crate::cli::OutputFormat::Sam) => {
Some(noodles_util::alignment::io::Format::Sam)
}
Some(crate::cli::OutputFormat::Bam) => {
Some(noodles_util::alignment::io::Format::Bam)
}
Some(crate::cli::OutputFormat::Cram) => {
Some(noodles_util::alignment::io::Format::Cram)
}
Some(crate::cli::OutputFormat::Fasta) | Some(crate::cli::OutputFormat::Fastq) => {
None
}
None => {
if self.output.len() < 2 {
second_input_format
} else {
crate::alignment::infer_format_from_path(&self.output[1])
}
}
};
let is_fasta_2 = match self.output_format {
Some(crate::cli::OutputFormat::Fasta) => true,
Some(crate::cli::OutputFormat::Fastq) => false,
_ => {
if self.output.len() < 2 {
is_fasta_path(&self.input[1])
} else {
is_fasta_path(&self.output[1])
}
}
};
total_kept_bases += second_input_source.filter_reads_into(
&reads_to_keep,
nb_reads_to_keep,
&mut second_output_handle,
output_format_2,
is_fasta_2,
)? as u64;
}
if let Some(gsize) = self.genome_size {
let actual_covg = total_kept_bases / gsize;
if Coverage(actual_covg as f32) < self.coverage.unwrap() {
warn!(
"Requested coverage ({:.2}x) is not possible as the actual coverage is {:.2}x - \
output will be the same as the input",
self.coverage.unwrap().0,
actual_covg
);
} else {
info!("Actual coverage of kept reads is {:.2}x", actual_covg);
}
} else {
info!("Kept {} bases", total_kept_bases);
}
info!("Done 🎉");
Ok(())
}
}
fn is_fasta_path(path: &Path) -> bool {
let mut p = path.to_path_buf();
if let Some(ext) = p.extension().map(|e| e.to_string_lossy().to_lowercase()) {
if ["gz", "bz2", "xz", "zst", "bz"].contains(&ext.as_str()) {
p = p.with_extension("");
}
}
if let Some(ext) = p.extension().map(|e| e.to_string_lossy().to_lowercase()) {
return ext == "fasta" || ext == "fa";
}
false
}
#[cfg(test)]
mod tests {
use assert_cmd::Command;
const SUB: &str = "reads";
#[test]
fn no_args_given_raises_error() {
let passed_args = vec![SUB];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn no_input_file_given_raises_error() {
let passed_args = vec![SUB, "-c", "30", "-g", "3mb"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn no_coverage_given_raises_error() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-g", "3mb"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn invalid_coverage_given_raises_error() {
let passed_args = vec![SUB, "in.fq", "-g", "3mb", "-c", "foo"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn no_genome_size_given_raises_error() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-c", "5"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn no_genome_size_but_bases_and_coverage() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-b", "5", "-c", "7"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn bases_and_coverage_and_genome_size_all_allowed() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-b", "5", "-c", "7", "-g", "5m"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().success();
}
#[test]
fn no_genome_size_or_coverage_given_but_bases_given() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-b", "5"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().success();
}
#[test]
fn no_genome_size_or_coverage_given_but_num_given() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-n", "5"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().success();
}
#[test]
fn no_genome_size_or_coverage_given_but_frac_given() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-f", "5"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().success();
}
#[test]
fn num_and_coverage_and_genome_size_not_allowed() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-n", "5", "-c", "7", "-g", "5m"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn frac_and_coverage_and_genome_size_not_allowed() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-f", "5", "-c", "7", "-g", "5m"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn frac_and_bases_not_allowed() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-f", "5", "-b", "7"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn frac_and_num_not_allowed() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-f", "5", "-n", "7"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn bases_and_num_not_allowed() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-b", "5", "-n", "7"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn invalid_genome_size_given_raises_error() {
let passed_args = vec![SUB, "in.fq", "-c", "5", "-g", "8jb"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn faidx_given_as_genome_size() {
let infile = "tests/cases/r1.fq.gz";
let faidx = "tests/cases/h37rv.fa.fai";
let passed_args = vec![SUB, infile, "-c", "5", "-g", faidx];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().success();
}
#[test]
fn invalid_seed_given_raises_error() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-c", "5", "-g", "8mb", "-s", "foo"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn all_valid_args_parsed_as_expected() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![
SUB,
infile,
"-c",
"5",
"-g",
"8mb",
"-s",
"88",
"-o",
"/dev/null",
];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().success();
}
#[test]
fn two_outputs_passed_with_one_option_flag() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![
SUB,
infile,
infile,
"-c",
"5",
"-g",
"8mb",
"-s",
"88",
"-o",
"/dev/null",
"/dev/null",
];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn three_inputs_raises_error() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![
SUB,
infile,
infile,
infile,
"-c",
"5",
"-g",
"8mb",
"-s",
"88",
"-o",
"my/output/file.fq",
];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn three_outputs_raises_error() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![
SUB,
infile,
infile,
"-c",
"5",
"-g",
"8mb",
"-s",
"88",
"-o",
"my/output/file.fq",
"-o",
"out.fq",
"-o",
"out.fq",
];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn one_input_no_output_is_ok() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![SUB, infile, "-c", "5", "-g", "8mb", "-s", "88"];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().success();
}
#[test]
fn two_inputs_one_output_raises_error() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![
SUB, infile, infile, "-c", "5", "-g", "8mb", "-s", "88", "-o", "out.fq",
];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn one_input_two_outputs_raises_error() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![
SUB, infile, "-c", "5", "-g", "8mb", "-s", "88", "-o", "out.fq", "-o", "out.fq",
];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().failure();
}
#[test]
fn two_input_two_outputs_is_ok() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![
SUB, infile, infile, "-c", "5", "-g", "8mb", "-s", "88", "-o", "out.fq", "-o", "out.fq",
];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().success();
}
#[test]
fn two_input_two_outputs_is_ok_when_positional_args_at_end() {
let infile = "tests/cases/r1.fq.gz";
let passed_args = vec![
SUB, "-c", "5", "-g", "8mb", "-s", "88", "-o", "out.fq", "-o", "out.fq", infile, infile,
];
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
cmd.args(passed_args).assert().success();
}
}