#[global_allocator]
static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
use ahash::AHashSet;
use anyhow::{Context, Result, bail, ensure};
use bio::data_structures::bitenc::BitEnc;
use clap::builder::styling;
use clap::{ColorChoice, Parser as ClapParser, ValueEnum};
use env_logger::Env;
use flate2::bufread::MultiGzDecoder;
use fqgrep_lib::matcher::{Matcher, MatcherFactory, MatcherOpts, validate_fixed_pattern};
use fqgrep_lib::seq_io::{
InterleavedFastqReader, PairedFastqReader, ordered_parallel_fastq,
ordered_parallel_interleaved_fastq, ordered_parallel_paired_fastq, parallel_interleaved_fastq,
parallel_paired_fastq,
};
use fqgrep_lib::{is_fastq_path, is_gzip_path};
use gzp::BUFSIZE;
use isatty::stdout_isatty;
use proglog::{CountFormatterKind, ProgLog, ProgLogBuilder};
use seq_io::fastq::{self, Record, RefRecord};
use seq_io::parallel::parallel_fastq;
use std::process::ExitCode;
use std::{
fs::File,
io::{BufRead, BufReader, BufWriter, Read, Write},
path::PathBuf,
str::FromStr,
sync::LazyLock,
};
pub static NUM_CPU: LazyLock<String> = LazyLock::new(|| num_cpus::get().to_string());
pub mod built_info {
use std::sync::LazyLock;
include!(concat!(env!("OUT_DIR"), "/built.rs"));
fn get_software_version() -> String {
let prefix = if let Some(s) = GIT_COMMIT_HASH {
format!("{}-{}", PKG_VERSION, s[0..8].to_owned())
} else {
PKG_VERSION.to_string()
};
let suffix = match GIT_DIRTY {
Some(true) => "-dirty",
_ => "",
};
format!("{}{}", prefix, suffix)
}
pub static VERSION: LazyLock<String> = LazyLock::new(get_software_version);
}
fn spawn_reader(
file: PathBuf,
decompress: bool,
) -> Result<fastq::Reader<Box<dyn std::io::Read + Send>>> {
let raw_handle = if file.as_os_str() == "-" {
Box::new(std::io::stdin()) as Box<dyn Read + Send>
} else {
let handle = File::open(&file)
.with_context(|| format!("Error opening input: {}", file.display()))?;
Box::new(handle) as Box<dyn Read + Send>
};
let buf_handle: BufReader<Box<dyn std::io::Read + Send>> =
BufReader::with_capacity(BUFSIZE, raw_handle);
let maybe_decoder_handle: Box<dyn std::io::Read + Send> = {
let is_gzip = is_gzip_path(&file) || (!is_fastq_path(&file) && decompress);
if is_gzip {
Box::new(MultiGzDecoder::new(buf_handle)) as Box<dyn std::io::Read + Send>
} else {
Box::new(buf_handle) as Box<dyn std::io::Read + Send>
}
};
Ok(fastq::Reader::with_capacity(maybe_decoder_handle, BUFSIZE))
}
#[derive(ValueEnum, PartialEq, Debug, Clone)]
enum Color {
Never,
Always,
Auto,
}
#[derive(ValueEnum, PartialEq, Debug, Clone, Default)]
enum IupacOption {
#[default]
Never,
Expand,
Regex,
BitMask,
}
const STYLES: styling::Styles = styling::Styles::styled()
.header(styling::AnsiColor::Yellow.on_default().bold())
.usage(styling::AnsiColor::Yellow.on_default().bold())
.literal(styling::AnsiColor::Blue.on_default().bold())
.placeholder(styling::AnsiColor::Cyan.on_default());
#[derive(ClapParser, Debug, Clone)]
#[clap(
name = "fqgrep",
color = ColorChoice::Auto,
styles = STYLES,
version = built_info::VERSION.as_str())
]
#[allow(clippy::struct_excessive_bools)]
struct Opts {
#[clap(long, short = 't', default_value = NUM_CPU.as_str())]
threads: usize,
#[clap(long = "color", default_value = "never")]
color: Color,
#[clap(long, short = 'c')]
count: bool,
#[clap(long, short = 'e')]
regexp: Vec<String>,
#[clap(long, short = 'F')]
fixed_strings: bool,
#[clap(long, short = 'f')]
file: Option<PathBuf>,
#[clap(long = "read-names-file", short = 'N', conflicts_with_all = ["regexp", "file", "fixed_strings"])]
read_names_file: Option<PathBuf>,
#[clap(short = 'v')]
invert_match: bool,
#[clap(short = 'Z', long)]
decompress: bool,
#[clap(long)]
paired: bool,
#[clap(long)]
no_order: bool,
#[clap(long)]
reverse_complement: bool,
#[clap(long)]
progress: bool,
#[clap(long, conflicts_with = "reverse_complement")]
protein: bool,
#[clap(long, default_value = "never", conflicts_with = "protein")]
iupac: IupacOption,
args: Vec<String>,
#[clap(long, hide_short_help = true, hide_long_help = true)]
output: Vec<PathBuf>,
}
fn read_patterns(file: &PathBuf) -> Result<Vec<String>> {
let f = File::open(file)
.with_context(|| format!("Error opening pattern file: {}", file.display()))?;
let r = BufReader::new(f);
let mut v = Vec::new();
for result in r.lines() {
v.push(result?);
}
Ok(v)
}
fn read_query_names(file: &PathBuf) -> Result<AHashSet<Vec<u8>>> {
let f = File::open(file)
.with_context(|| format!("Error opening query names file: {}", file.display()))?;
let r = BufReader::new(f);
let mut names = AHashSet::new();
for result in r.lines() {
let line = result?;
let trimmed = line.trim();
if !trimmed.is_empty() {
names.insert(trimmed.as_bytes().to_vec());
}
}
Ok(names)
}
fn main() -> ExitCode {
if let Some(fqgrep_output) = fqgrep(&setup()) {
ExitCode::from(fqgrep_output)
} else {
ExitCode::SUCCESS
}
}
fn fqgrep(opts: &Opts) -> Option<u8> {
let outer = std::panic::catch_unwind(|| fqgrep_from_opts(opts));
match outer {
Err(_) => {
eprintln!("Error: fqgrep panicked. Please report this as a bug!");
Some(101)
}
Ok(inner) => match inner {
Ok(0) => Some(1),
Ok(_) => None,
Err(e) => {
eprintln!("Error: {}", e);
Some(2)
}
},
}
}
fn expand_iupac(
pattern: &mut Option<String>,
regexp: &mut Vec<String>,
fixed_strings: &mut bool,
iupac: &IupacOption,
) -> Result<Option<Vec<BitEnc>>> {
use fqgrep_lib::{encode, expand_iupac_fixed_pattern, expand_iupac_regex};
if *iupac == IupacOption::Never {
return Ok(None);
}
if !*fixed_strings {
eprintln!("Warning: --iupac is ignored without --fixed-strings (-F)");
return Ok(None);
}
let patterns: Vec<String> = if let Some(p) = pattern.take() {
vec![p]
} else {
std::mem::take(regexp)
};
match iupac {
IupacOption::Expand => {
let mut expanded = Vec::new();
for p in &patterns {
let mut local = Vec::new();
expand_iupac_fixed_pattern(p.as_bytes(), 0, &mut Vec::new(), &mut local)
.map_err(|e| anyhow::anyhow!(e))?;
for e in local {
expanded.push(String::from_utf8(e).unwrap());
}
}
*regexp = expanded;
Ok(None)
}
IupacOption::Regex => {
let mut expanded = Vec::new();
for p in &patterns {
expanded.push(String::from_utf8(expand_iupac_regex(p.as_bytes())).unwrap());
}
*regexp = expanded;
*fixed_strings = false;
Ok(None)
}
IupacOption::BitMask => {
let bitencs: Vec<BitEnc> = patterns.iter().map(|p| encode(p.as_bytes())).collect();
if bitencs.len() == 1 {
*pattern = Some(patterns.into_iter().next().unwrap());
} else {
*regexp = patterns;
}
Ok(Some(bitencs))
}
IupacOption::Never => unreachable!(),
}
}
#[allow(clippy::too_many_lines)]
fn fqgrep_from_opts(opts: &Opts) -> Result<usize> {
let mut opts = opts.clone();
let query_names: Option<AHashSet<Vec<u8>>> = if let Some(file) = &opts.read_names_file {
let names = read_query_names(file)?;
if opts.reverse_complement {
eprintln!(
"Warning: --reverse-complement is ignored when using query name matching (-N)"
);
}
Some(names)
} else {
if let Some(file) = &opts.file {
for pattern in read_patterns(file)? {
opts.regexp.push(pattern);
}
}
None
};
let (mut pattern, mut files): (Option<String>, Vec<PathBuf>) = {
let (pattern, file_strings): (Option<String>, Vec<String>) =
if query_names.is_some() || opts.read_names_file.is_some() || !opts.regexp.is_empty() {
(None, opts.args.clone())
} else {
ensure!(
!opts.args.is_empty(),
"Pattern must be given with -e or as the first positional argument "
);
let files = opts.args.iter().skip(1).cloned().collect();
(Some(opts.args[0].clone()), files)
};
let files = file_strings
.iter()
.map(|s| {
PathBuf::from_str(s)
.with_context(|| format!("Cannot create a path from: {}", s))
.unwrap()
})
.collect();
(pattern, files)
};
let bitencs = expand_iupac(
&mut pattern,
&mut opts.regexp,
&mut opts.fixed_strings,
&opts.iupac,
)?;
if opts.paired {
ensure!(
files.len() <= 1 || files.len() % 2 == 0,
"Input files must be a multiple of two, or either a single file or standard input (assume interleaved) with --paired"
);
}
let paired_output = opts.output.len() == 2;
ensure!(
opts.output.len() <= 2,
"Expected at most 2 output files, got {}",
opts.output.len()
);
if paired_output {
ensure!(
opts.paired,
"Two output files may only be given with --paired"
);
}
if query_names.is_none() && opts.fixed_strings && bitencs.is_none() {
if let Some(pattern) = &pattern {
validate_fixed_pattern(pattern, opts.protein)?;
} else if !opts.regexp.is_empty() {
for pattern in &opts.regexp {
validate_fixed_pattern(pattern, opts.protein)?;
}
} else {
bail!("A pattern must be given as a positional argument or with -e/--regexp")
}
}
let progress_logger = if opts.progress {
Some(
ProgLogBuilder::new()
.name("fqgrep-progress")
.noun("reads")
.verb("Searched")
.unit(50_000_000)
.count_formatter(CountFormatterKind::Comma)
.build(),
)
} else {
None
};
let match_opts = MatcherOpts {
invert_match: opts.invert_match,
reverse_complement: opts.reverse_complement,
};
let color = query_names.is_none()
&& (opts.color == Color::Always || (opts.color == Color::Auto && stdout_isatty()));
type OptWriter = Option<Box<dyn Write>>;
let (mut r1_writer, mut r2_writer): (OptWriter, OptWriter) = {
if opts.count {
(None, None)
} else if paired_output {
let w1 = Box::new(BufWriter::with_capacity(
BUFSIZE,
File::create(&opts.output[0]).with_context(|| {
format!("Error creating output: {}", opts.output[0].display())
})?,
));
let w2 = Box::new(BufWriter::with_capacity(
BUFSIZE,
File::create(&opts.output[1]).with_context(|| {
format!("Error creating output: {}", opts.output[1].display())
})?,
));
(Some(w1), Some(w2))
} else if opts.output.len() == 1 {
let w = Box::new(BufWriter::with_capacity(
BUFSIZE,
File::create(&opts.output[0]).with_context(|| {
format!("Error creating output: {}", opts.output[0].display())
})?,
));
(Some(w), None)
} else {
(
Some(Box::new(BufWriter::with_capacity(
BUFSIZE,
std::io::stdout(),
))),
None,
)
}
};
let mut num_matches = 0usize;
let matcher: Box<dyn Matcher + Sync + Send> = if let Some(names) = query_names {
MatcherFactory::new_query_name_matcher(names, match_opts)
} else {
MatcherFactory::new_matcher(
&pattern,
opts.fixed_strings,
&opts.regexp,
bitencs,
match_opts,
)?
};
if files.is_empty() {
files.push(PathBuf::from_str("-").unwrap());
}
if opts.paired {
if files.len() == 1 {
let reader = InterleavedFastqReader::new(spawn_reader(
files.first().unwrap().clone(),
opts.decompress,
)?);
let work = |(read1, read2): (RefRecord, RefRecord), found: &mut u32| {
*found = process_paired_reads(&read1, &read2, &matcher, &progress_logger);
};
let func = |(read1, read2): (RefRecord, RefRecord), found: &mut u32| {
if *found > 0 {
num_matches += 1;
write_paired_record(
&read1,
&read2,
*found,
color,
&matcher,
&mut r1_writer,
&mut r2_writer,
);
}
None::<()>
};
if opts.no_order || opts.count {
parallel_interleaved_fastq(reader, opts.threads as u32, opts.threads, work, func)
.unwrap();
} else {
ordered_parallel_interleaved_fastq(
reader,
opts.threads as u32,
opts.threads,
work,
func,
)
.unwrap();
}
} else {
for file_pairs in files.chunks_exact(2) {
let reader1: fastq::Reader<Box<dyn Read + Send>> =
spawn_reader(file_pairs[0].clone(), opts.decompress)?;
let reader2: fastq::Reader<Box<dyn Read + Send>> =
spawn_reader(file_pairs[1].clone(), opts.decompress)?;
let reader = PairedFastqReader::new(reader1, reader2);
let work = |(read1, read2): (RefRecord, RefRecord), found: &mut u32| {
*found = process_paired_reads(&read1, &read2, &matcher, &progress_logger);
};
let func = |(read1, read2): (RefRecord, RefRecord), found: &mut u32| {
if *found > 0 {
num_matches += 1;
write_paired_record(
&read1,
&read2,
*found,
color,
&matcher,
&mut r1_writer,
&mut r2_writer,
);
}
None::<()>
};
if opts.no_order || opts.count {
parallel_paired_fastq(reader, opts.threads as u32, opts.threads, work, func)
.unwrap();
} else {
ordered_parallel_paired_fastq(
reader,
opts.threads as u32,
opts.threads,
work,
func,
)
.unwrap();
}
}
}
} else {
for file in files {
let reader: fastq::Reader<Box<dyn Read + Send>> =
spawn_reader(file.clone(), opts.decompress)?;
let work = |record: RefRecord, found: &mut bool| {
if let Some(progress) = &progress_logger {
progress.record();
}
*found = matcher.read_match(&record);
};
let func = |record: RefRecord, found: &mut bool| {
if *found {
num_matches += 1;
if let Some(writer) = &mut r1_writer {
if color {
let mut record = record.to_owned_record();
matcher.color(&mut record, *found);
fastq::write_to(writer, record.head(), record.seq(), record.qual())
.unwrap();
} else {
fastq::write_to(writer, record.head(), record.seq(), record.qual())
.unwrap();
}
}
}
None::<()>
};
if opts.no_order || opts.count {
parallel_fastq(reader, opts.threads as u32, opts.threads, work, func).unwrap();
} else {
ordered_parallel_fastq(reader, opts.threads as u32, opts.threads, work, func)
.unwrap();
}
}
}
if opts.count {
std::io::stdout()
.write_all(format!("{num_matches}\n").as_bytes())
.unwrap();
}
Ok(num_matches)
}
#[allow(clippy::borrowed_box)]
fn write_paired_record(
read1: &RefRecord,
read2: &RefRecord,
found: u32,
color: bool,
matcher: &Box<dyn Matcher + Sync + Send>,
r1_writer: &mut Option<Box<dyn Write>>,
r2_writer: &mut Option<Box<dyn Write>>,
) {
let (w1, w2) = if r2_writer.is_some() {
let w1 = r1_writer.as_mut();
let w2 = r2_writer.as_mut();
(w1, w2)
} else {
(r1_writer.as_mut(), None)
};
if let Some(w1) = w1 {
if color {
let found1 = found == 1;
let found2 = found == 2 || matcher.read_match(read2);
let mut read1 = read1.to_owned_record();
let mut read2 = read2.to_owned_record();
matcher.color(&mut read1, found1);
matcher.color(&mut read2, found2);
fastq::write_to(&mut *w1, read1.head(), read1.seq(), read1.qual()).unwrap();
if let Some(w2) = w2 {
fastq::write_to(&mut *w2, read2.head(), read2.seq(), read2.qual()).unwrap();
} else {
fastq::write_to(&mut *w1, read2.head(), read2.seq(), read2.qual()).unwrap();
}
} else {
fastq::write_to(&mut *w1, read1.head(), read1.seq(), read1.qual()).unwrap();
if let Some(w2) = w2 {
fastq::write_to(&mut *w2, read2.head(), read2.seq(), read2.qual()).unwrap();
} else {
fastq::write_to(&mut *w1, read2.head(), read2.seq(), read2.qual()).unwrap();
}
}
}
}
#[allow(clippy::ref_option)] #[allow(clippy::borrowed_box)] fn process_paired_reads(
read1: &RefRecord,
read2: &RefRecord,
matcher: &Box<dyn Matcher + Sync + Send>, progress_logger: &Option<ProgLog>,
) -> u32 {
if let Some(progress) = progress_logger {
progress.record();
progress.record();
}
assert_eq!(
read1.id_bytes(),
read2.id_bytes(),
"Mismatching read pair! R1: {} R2: {}",
read1.id().unwrap(),
read2.id().unwrap()
);
if matcher.read_match(read1) {
1
} else if matcher.read_match(read2) {
2
} else {
0
}
}
fn setup() -> Opts {
if std::env::var("RUST_LOG").is_err() {
unsafe {
std::env::set_var("RUST_LOG", "info");
}
}
env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
Opts::parse()
}
#[cfg(test)]
pub mod tests {
use crate::*;
use fgoxide::io::Io;
use rstest::rstest;
use seq_io::fastq::{OwnedRecord, Record};
use tempfile::TempDir;
fn write_fastq(
temp_dir: &TempDir,
sequences_per_fastq: &Vec<Vec<&str>>,
file_extension: String,
) -> Vec<String> {
let mut fastq_paths = Vec::new();
for (fastq_index, fastq_sequences) in sequences_per_fastq.iter().enumerate() {
let name = format!("sample_{fastq_index}{file_extension}");
let fastq_path = temp_dir.path().join(name);
let io = Io::default();
let mut writer = io.new_writer(&fastq_path).unwrap();
for (num, seq) in fastq_sequences.iter().enumerate() {
let read = OwnedRecord {
head: format!("@{num}").as_bytes().to_vec(),
seq: seq.as_bytes().to_vec(),
qual: vec![b'X'; seq.len()],
};
fastq::write_to(&mut writer, &read.head, &read.seq, &read.qual)
.expect("failed writing read");
}
let path_as_string = fastq_path.as_path().display().to_string();
fastq_paths.push(path_as_string);
}
fastq_paths
}
fn write_pattern(temp_dir: &TempDir, pattern: &Vec<String>) -> PathBuf {
let io = Io::default();
let name = String::from("pattern.txt");
let pattern_path = temp_dir.path().join(name);
io.write_lines(&pattern_path, pattern).unwrap();
pattern_path
}
fn build_opts(
dir: &TempDir,
seqs: &Vec<Vec<&str>>,
regexp: &Vec<String>,
pattern_from_file: bool,
output: Vec<PathBuf>,
compression: String,
) -> Opts {
let fq_path = write_fastq(dir, seqs, compression);
let (pattern_string, pattern_file) = {
if pattern_from_file {
(vec![], Some(write_pattern(dir, regexp)))
} else {
(regexp.clone(), None)
}
};
Opts {
threads: 4,
color: Color::Never,
count: output.is_empty(),
regexp: pattern_string,
fixed_strings: false,
file: pattern_file,
read_names_file: None,
invert_match: false,
decompress: false,
paired: false,
no_order: false,
reverse_complement: false,
progress: true,
protein: false,
iupac: IupacOption::Never,
args: fq_path.clone(),
output,
}
}
fn slurp_output(result_path: PathBuf) -> Vec<String> {
let mut return_seqs = vec![];
let mut reader = fastq::Reader::from_path(result_path).unwrap();
while let Some(record) = reader.next() {
let record = record.expect("Error reading record");
let seq = std::str::from_utf8(record.seq()).unwrap();
return_seqs.push(seq.to_owned());
}
return_seqs
}
#[rstest]
#[case(false, vec!["AA"], 1)] #[case(false, vec!["CCCG"], 0)] #[case(false, vec!["T"], 3)] #[case(false, vec!["^A"], 1)] #[case(false, vec!["T.....G"], 0)] #[case(false, vec!["G"], 4)] #[case(false, vec!["GGCC", "G..C"], 1)] #[case(false, vec!["Z", "A.....G"], 0)] #[case(false,vec!["^T", "AA"], 3)] #[case(true, vec!["AA"], 1)] #[case(true, vec!["CCCG"], 0)] #[case(true, vec!["CC"], 2)] #[case(true, vec!["^A"], 1)] #[case(true, vec!["T.....G"], 0)] #[case(true, vec!["G"], 3)] #[case(true, vec!["GGCC", "G..C"], 1)] #[case(true, vec!["Z", "A.....G"], 0)] #[case(true, vec!["^T", "AA"], 3)] fn test_reads_when_count_true(
#[case] paired: bool,
#[case] pattern: Vec<&str>,
#[case] expected: usize,
) {
let dir = TempDir::new().unwrap();
let seqs = vec![
vec!["AAAA", "TTTT"],
vec!["GGGG", "CCCC"],
vec!["TTCT", "CGCG"],
vec!["GGTT", "GGCC"],
];
let pattern = pattern.iter().map(|&s| s.to_owned()).collect::<Vec<_>>();
let mut opts = build_opts(&dir, &seqs, &pattern, true, Vec::new(), String::from(".fq"));
opts.paired = paired;
let result = fqgrep_from_opts(&opts);
assert_eq!(result.unwrap(), expected);
}
#[rstest]
#[case(false, vec!["A"], vec!["AAAA", "ATAT", "TATA", "AAAT", "TTTA"], true)] #[case(false, vec!["A", "G"], vec!["AAAA", "ATAT", "TATA", "AAAT", "TTTA", "GGGG", "CGCG", "GCGC", "CCCG", "GGGC"], true)] #[case(false, vec!["^A"], vec!["AAAA", "ATAT", "AAAT"], true)] #[case(false, vec!["^A", "^G"], vec!["AAAA", "ATAT", "AAAT", "GGGG", "GCGC", "GGGC"], true)] #[case(true, vec!["AAAA"], vec!["AAAA", "CCCC"], true)] #[case(true, vec!["A", "G"], vec!["AAAA", "CCCC", "TTTT", "GGGG", "ATAT", "CGCG", "TATA", "GCGC", "AAAT", "CCCG", "TTTA", "GGGC"], true)]
#[case(true, vec!["A", "G"], vec!["AAAA", "GGGG", "TTTT", "CCCC", "CGCG", "ATAT", "GCGC", "TATA", "CCCG", "AAAT", "GGGC", "TTTA"], false)]
#[case(true, vec!["^AAAA"], vec!["AAAA", "CCCC"], true)] #[case(true, vec!["^AA", "^GG"], vec!["AAAA", "CCCC", "TTTT", "GGGG", "AAAT", "CCCG", "TTTA", "GGGC"], true)] #[case(true, vec!["^AA", "^GG"], vec!["AAAA", "GGGG", "TTTT", "CCCC", "TTTA", "CCCG", "AAAT", "GGGC"], false)] fn test_reads_when_count_false(
#[case] paired: bool,
#[case] pattern: Vec<&str>,
#[case] expected_seq: Vec<&str>,
#[case] expected_bool: bool,
) {
let dir = TempDir::new().unwrap();
let seqs = vec![
vec!["AAAA", "TTTT", "ATAT", "TATA", "AAAT", "TTTA"],
vec!["CCCC", "GGGG", "CGCG", "GCGC", "CCCG", "GGGC"],
];
let out_path = dir.path().join(String::from("output.fq"));
let result_path = &out_path.clone();
let pattern = pattern.iter().map(|&s| s.to_owned()).collect::<Vec<_>>();
let mut opts = build_opts(
&dir,
&seqs,
&pattern,
true,
vec![out_path],
String::from(".fq"),
);
opts.paired = paired;
let _result = fqgrep_from_opts(&opts);
let return_sequences = slurp_output(result_path.to_path_buf());
let sequence_match = expected_seq == return_sequences;
assert_eq!(sequence_match, expected_bool);
}
#[rstest]
#[case(true, vec!["MQR"], vec!["MQRFPW", "MQRHKD"])] #[case(true, vec!["FPW"], vec!["MQRFPW"])] #[case(true, vec!["ZZZ"], vec![])] #[case(false, vec!["^MQR"], vec!["MQRFPW", "MQRHKD"])] #[case(false, vec!["^MQR", "^RHK"], vec!["MQRFPW", "MQRHKD", "RHKDEW"])] #[case(false, vec!["^ZZZ", "^YYY"], vec![])] fn test_protein_ok(
#[case] fixed_strings: bool,
#[case] pattern: Vec<&str>,
#[case] expected_seq: Vec<&str>,
) {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["MQRFPW", "MQRHKD", "RHKDEW", "WYFPQL"]];
let out_path = dir.path().join(String::from("output.fq"));
let result_path = &out_path.clone();
let pattern = pattern.iter().map(|&s| s.to_owned()).collect::<Vec<_>>();
let mut opts = build_opts(
&dir,
&seqs,
&pattern,
true,
vec![out_path],
String::from(".fq"),
);
opts.protein = true;
opts.fixed_strings = fixed_strings;
let _result = fqgrep_from_opts(&opts);
let return_sequences = slurp_output(result_path.to_path_buf());
assert_eq!(return_sequences, expected_seq);
}
#[rstest]
#[case(false, false, false, 0)] #[case(false, false, true, 1)] #[case(false, true, false, 4)] #[case(false, true, true, 3)] #[case(true, false, false, 0)] #[case(true, false, true, 1)] #[case(true, true, false, 2)] #[case(true, true, true, 2)] fn test_reverse_complement_and_invert_match(
#[case] paired: bool,
#[case] invert_match: bool,
#[case] reverse_complement: bool,
#[case] expected: usize,
) {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["GGGG", "GGGG"], vec!["AAAA", "CCCC"]];
let pattern = vec![String::from("TTTT")];
let mut opts = build_opts(
&dir,
&seqs,
&pattern,
false,
Vec::new(),
String::from(".fq"),
);
opts.paired = paired;
opts.invert_match = invert_match;
opts.reverse_complement = reverse_complement;
let result = fqgrep_from_opts(&opts);
assert_eq!(result.unwrap(), expected);
}
fn slurp_fastq(path: &PathBuf) -> Vec<OwnedRecord> {
let handle = File::open(path).unwrap();
let buf_handle = BufReader::with_capacity(BUFSIZE, handle);
let maybe_decoder_handle: Box<dyn Read> = if is_gzip_path(path) {
Box::new(MultiGzDecoder::new(buf_handle))
} else {
Box::new(buf_handle)
};
fastq::Reader::with_capacity(maybe_decoder_handle, BUFSIZE)
.into_records()
.map(|r| r.expect("Error reading"))
.collect::<Vec<_>>()
}
#[rstest]
fn test_paired_outputs() {
let dir: TempDir = TempDir::new().unwrap();
let seqs = vec![
vec!["GGGG", "AAAA", "AGGG", "TGCA"],
vec!["GGGG", "GGGA", "CCCC", "ACGT"],
];
let pattern = vec![String::from("GGG")];
let mut opts = build_opts(
&dir,
&seqs,
&pattern,
false,
Vec::new(),
String::from(".fq"),
);
let r1_output = dir.path().join("out.r1.fq");
let r2_output = dir.path().join("out.r2.fq");
opts.paired = true;
opts.invert_match = false;
opts.reverse_complement = false;
opts.output = vec![r1_output, r2_output];
opts.count = false;
let result = fqgrep_from_opts(&opts);
assert_eq!(result.unwrap(), 3);
let r1_records = slurp_fastq(&opts.output[0]);
let r2_records = slurp_fastq(&opts.output[1]);
assert_eq!(r1_records.len(), 3);
assert_eq!(r2_records.len(), 3);
for (i, rec) in r1_records.iter().enumerate() {
let seq: String = String::from_utf8_lossy(rec.seq()).to_string();
assert_eq!(seq, seqs[0][i]);
}
for (i, rec) in r2_records.iter().enumerate() {
let seq: String = String::from_utf8_lossy(rec.seq()).to_string();
assert_eq!(seq, seqs[1][i]);
}
}
#[test]
fn test_fails_with_protein_and_reverse_complement() {
let result = Opts::try_parse_from(["fqgrep", "--protein", "--reverse-complement", "AAA"]);
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(err.contains("reverse-complement"));
}
#[rstest]
#[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only DNA bases: .. [^] .. G"
)]
#[case(true, false, vec![String::from("^G")])] #[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only DNA bases: .. [^] .. A"
)]
#[case(true, false, vec![String::from("^A"),String::from("AA")])] #[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only amino acids: .. [^] .. Q"
)]
#[case(true, true, vec![String::from("^Q")])] #[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only amino acids: .. [^] .. Q"
)]
#[case(true, true, vec![String::from("^Q"),String::from("QQ")])] fn test_regexp_from_fixed_string_fails_with_regex(
#[case] fixed_strings: bool,
#[case] protein: bool,
#[case] pattern: Vec<String>,
) {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["GGGG", "TTTT"], vec!["AAAA", "CCCC"]];
let mut opts = build_opts(&dir, &seqs, &pattern, true, Vec::new(), String::from(".fq"));
opts.fixed_strings = fixed_strings;
opts.protein = protein;
let _result = fqgrep_from_opts(&opts);
_result.unwrap();
}
#[test]
#[should_panic(
expected = "Input files must be a multiple of two, or either a single file or standard input (assume interleaved) with --paired"
)]
fn test_paired_should_panic_with_three_records() {
let dir = TempDir::new().unwrap();
let seqs = vec![
vec!["GTCAGCTCGAGCATCAGCTACGCACT"],
vec!["AGTGCGTAGCTGATGCTCGAGCTGAC"],
vec!["GGGTCTGAATCCATGGAAAGCTATTG"],
];
let test_pattern = vec![String::from("A")];
let mut opts_test = build_opts(
&dir,
&seqs,
&test_pattern,
true,
Vec::new(),
String::from(".fq"),
);
opts_test.paired = true;
let _num_matches = fqgrep_from_opts(&opts_test);
_num_matches.unwrap();
}
#[test]
fn test_regexp_matches_from_file_and_string() {
let dir = TempDir::new().unwrap();
let seqs = vec![
vec!["GTCAGCTCGAGCATCAGCTACGCACT"],
vec!["AGTGCGTAGCTGATGCTCGAGCTGAC"],
vec!["GGGTCTGAATCCATGGAAAGCTATTG"],
];
let test_pattern = vec![String::from("^G")];
let mut opts_test = build_opts(
&dir,
&seqs,
&test_pattern,
true,
Vec::new(),
String::from(".fq"),
);
let result = fqgrep_from_opts(&opts_test);
assert_eq!(result.unwrap(), 2);
opts_test.regexp = test_pattern;
let result = fqgrep_from_opts(&opts_test);
assert_eq!(result.unwrap(), 2);
}
#[rstest]
#[case(String::from(".fq"), 3)]
#[case(String::from(".fastq"), 3)]
#[case(String::from(".fq.gz"), 3)]
#[case(String::from(".fq.bgz"), 3)]
fn test_fastq_compression(#[case] extension: String, #[case] expected: usize) {
let dir = TempDir::new().unwrap();
let seqs = vec![
vec!["GTCAG", "ACGT", "GGTG"],
vec!["AGTGCGTAGCTGATGCTCGAGCTGAC"],
vec!["GGGTCTGAATCCATGGAAAGCTATTG"],
];
let test_pattern = vec![String::from("^G")];
let opts = build_opts(&dir, &seqs, &test_pattern, true, Vec::new(), extension);
let result = fqgrep_from_opts(&opts);
assert_eq!(result.unwrap(), expected);
}
#[rstest]
#[case(false, vec![String::from("*")], Some(2))] #[case(false, vec![String::from("^T")], Some(1))] #[case(true, vec![String::from("GTCAGC")], None)] #[case(true, vec![String::from("^T")], Some(2))] fn test_exit_code_catching(
#[case] fixed_strings: bool,
#[case] pattern: Vec<String>,
#[case] expected: Option<u8>,
) {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["GTCAGC"], vec!["AGTGCG"], vec!["GGGTCTG"]];
let mut opts = build_opts(&dir, &seqs, &pattern, true, Vec::new(), String::from(".fq"));
opts.fixed_strings = fixed_strings;
assert_eq!(fqgrep(&opts), expected);
}
#[rstest]
#[case(false, vec!["@0"], 1)] #[case(false, vec!["@0", "@2"], 2)] #[case(false, vec!["@99"], 0)] #[case(true, vec!["@0"], 3)] #[case(true, vec!["@0", "@1", "@2", "@3"], 0)] fn test_query_name_matching_unpaired(
#[case] invert_match: bool,
#[case] query_names: Vec<&str>,
#[case] expected: usize,
) {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["AAAA", "TTTT", "GGGG", "CCCC"]];
let query_names: Vec<String> = query_names.iter().map(|s| s.to_string()).collect();
let query_names_path = write_pattern(&dir, &query_names);
let mut opts = build_opts(&dir, &seqs, &vec![], false, Vec::new(), String::from(".fq"));
opts.read_names_file = Some(query_names_path);
opts.invert_match = invert_match;
opts.regexp = vec![];
let result = fqgrep_from_opts(&opts);
assert_eq!(result.unwrap(), expected);
}
#[rstest]
#[case(false, vec!["@0"], 1)] #[case(false, vec!["@0", "@1"], 2)] #[case(true, vec!["@0", "@1"], 0)] fn test_query_name_matching_paired(
#[case] invert_match: bool,
#[case] query_names: Vec<&str>,
#[case] expected: usize,
) {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["AAAA", "TTTT"], vec!["GGGG", "CCCC"]];
let query_names: Vec<String> = query_names.iter().map(|s| s.to_string()).collect();
let query_names_path = write_pattern(&dir, &query_names);
let mut opts = build_opts(&dir, &seqs, &vec![], false, Vec::new(), String::from(".fq"));
opts.read_names_file = Some(query_names_path);
opts.invert_match = invert_match;
opts.paired = true;
opts.regexp = vec![];
let result = fqgrep_from_opts(&opts);
assert_eq!(result.unwrap(), expected);
}
#[test]
fn test_query_name_matching_empty_file_returns_zero_matches() {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["AAAA", "TTTT"]];
let query_names_path = write_pattern(&dir, &vec![]);
let mut opts = build_opts(&dir, &seqs, &vec![], false, Vec::new(), String::from(".fq"));
opts.read_names_file = Some(query_names_path);
opts.regexp = vec![];
let result = fqgrep_from_opts(&opts);
assert_eq!(result.unwrap(), 0);
}
#[test]
fn test_query_name_matching_empty_file_with_invert_returns_all() {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["AAAA", "TTTT"]];
let query_names_path = write_pattern(&dir, &vec![]);
let mut opts = build_opts(&dir, &seqs, &vec![], false, Vec::new(), String::from(".fq"));
opts.read_names_file = Some(query_names_path);
opts.invert_match = true;
opts.regexp = vec![];
let result = fqgrep_from_opts(&opts);
assert_eq!(result.unwrap(), 2);
}
#[rstest]
#[case(IupacOption::Expand)]
#[case(IupacOption::Regex)]
#[case(IupacOption::BitMask)]
fn test_iupac_modes_single_pattern(#[case] iupac_mode: IupacOption) {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["GATG", "GATT", "GATA", "GATC"]];
let pattern = vec![String::from("GATK")];
let mut opts = build_opts(
&dir,
&seqs,
&pattern,
false,
Vec::new(),
String::from(".fq"),
);
opts.fixed_strings = true;
opts.iupac = iupac_mode;
let result = fqgrep_from_opts(&opts);
assert_eq!(result.unwrap(), 2, "GATK should match GATG and GATT");
}
#[rstest]
#[case(IupacOption::Expand)]
#[case(IupacOption::Regex)]
#[case(IupacOption::BitMask)]
fn test_iupac_modes_multiple_patterns(#[case] iupac_mode: IupacOption) {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["GATG", "ACCA", "TTTT", "CCCC"]];
let pattern = vec![String::from("GATK"), String::from("ACS")];
let mut opts = build_opts(
&dir,
&seqs,
&pattern,
false,
Vec::new(),
String::from(".fq"),
);
opts.fixed_strings = true;
opts.iupac = iupac_mode;
let result = fqgrep_from_opts(&opts);
assert_eq!(result.unwrap(), 2, "GATK matches GATG, ACS matches ACCA");
}
#[rstest]
#[case(IupacOption::Expand)]
#[case(IupacOption::Regex)]
#[case(IupacOption::BitMask)]
fn test_iupac_modes_invert_match(#[case] iupac_mode: IupacOption) {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["GATG", "GATT", "GATA", "GATC"]];
let pattern = vec![String::from("GATK")];
let mut opts = build_opts(
&dir,
&seqs,
&pattern,
false,
Vec::new(),
String::from(".fq"),
);
opts.fixed_strings = true;
opts.iupac = iupac_mode;
opts.invert_match = true;
let result = fqgrep_from_opts(&opts);
assert_eq!(
result.unwrap(),
2,
"invert: GATA and GATC should not match GATK"
);
}
#[rstest]
#[case(IupacOption::Expand)]
#[case(IupacOption::Regex)]
#[case(IupacOption::BitMask)]
fn test_iupac_modes_n_wildcard(#[case] iupac_mode: IupacOption) {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["AAAT", "ACGT", "TTTT"]];
let pattern = vec![String::from("AN")];
let mut opts = build_opts(
&dir,
&seqs,
&pattern,
false,
Vec::new(),
String::from(".fq"),
);
opts.fixed_strings = true;
opts.iupac = iupac_mode;
let result = fqgrep_from_opts(&opts);
assert_eq!(result.unwrap(), 2, "AN should match AAAT and ACGT");
}
#[rstest]
fn test_iupac_expand_exceeds_limit_returns_error() {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["ACGT"]];
let pattern = vec![String::from("NNNNNNNN")];
let mut opts = build_opts(
&dir,
&seqs,
&pattern,
false,
Vec::new(),
String::from(".fq"),
);
opts.fixed_strings = true;
opts.iupac = IupacOption::Expand;
let result = fqgrep_from_opts(&opts);
assert!(
result.is_err(),
"Should return error for excessive IUPAC expansion"
);
}
#[rstest]
#[case(false)] #[case(true)] fn test_no_order_count(#[case] paired: bool) {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["AAAA", "TTTT"], vec!["GGGG", "CCCC"]];
let pattern = vec![String::from("A")];
let mut opts = build_opts(&dir, &seqs, &pattern, true, Vec::new(), String::from(".fq"));
opts.no_order = true;
opts.paired = paired;
let result = fqgrep_from_opts(&opts);
assert_eq!(result.unwrap(), 1);
}
#[test]
fn test_no_order_unpaired_output() {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["AAAA", "TTTT", "ATAT", "TATA"]];
let out_path = dir.path().join("output.fq");
let pattern = vec![String::from("A")];
let mut opts = build_opts(
&dir,
&seqs,
&pattern,
true,
vec![out_path.clone()],
String::from(".fq"),
);
opts.no_order = true;
let _result = fqgrep_from_opts(&opts);
let mut return_sequences = slurp_output(out_path);
return_sequences.sort();
let mut expected = vec!["AAAA", "ATAT", "TATA"];
expected.sort();
assert_eq!(return_sequences, expected);
}
#[test]
fn test_no_order_paired_interleaved_output() {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["AAAA", "TTTT"], vec!["CCCC", "GGGG"]];
let out_path = dir.path().join("output.fq");
let pattern = vec![String::from("A")];
let mut opts = build_opts(
&dir,
&seqs,
&pattern,
true,
vec![out_path.clone()],
String::from(".fq"),
);
opts.no_order = true;
opts.paired = true;
let _result = fqgrep_from_opts(&opts);
let mut return_sequences = slurp_output(out_path);
return_sequences.sort();
let mut expected = vec!["AAAA", "CCCC"];
expected.sort();
assert_eq!(return_sequences, expected);
}
}