use std::collections::HashSet;
use std::error::Error;
use std::fs::File;
use std::io::{self, BufRead, BufReader};
use std::process;
use clap::{Arg, Command};
use flate2::read::MultiGzDecoder;
use rustynetics::bam::{BamReader, BamReaderOptions};
use rustynetics::fastq::FastqReader;
use rustynetics::progress::{CountingReader, ProgressScope};
struct Config {
filename_bam: String,
filename_fastq: String,
keep_pair_suffixes: bool,
report_limit: usize,
}
struct CheckResult {
fastq_records: usize,
fastq_unique_names: usize,
bam_records: usize,
missing_records: usize,
missing_unique: usize,
missing_examples: Vec<String>,
}
const STATUS_RECORD_UPDATE_INTERVAL: usize = 10_000;
fn normalize_read_name(name: &str, keep_pair_suffixes: bool) -> String {
let token = name.split_whitespace().next().unwrap_or("");
let token = token.strip_prefix('@').unwrap_or(token);
if keep_pair_suffixes {
token.to_string()
} else {
token
.strip_suffix("/1")
.or_else(|| token.strip_suffix("/2"))
.unwrap_or(token)
.to_string()
}
}
fn open_fastq_reader(
filename: &str,
progress: Option<rustynetics::progress::ProgressHandle>,
) -> Result<Box<dyn BufRead>, Box<dyn Error>> {
let file = File::open(filename)?;
let reader = CountingReader::new(file, progress);
let mut reader = BufReader::new(reader);
let magic = reader.fill_buf()?;
if magic.len() >= 2 && magic[0] == 0x1f && magic[1] == 0x8b {
Ok(Box::new(BufReader::new(MultiGzDecoder::new(reader))))
} else {
Ok(Box::new(reader))
}
}
fn read_fastq_names(
filename: &str,
keep_pair_suffixes: bool,
) -> Result<(HashSet<String>, usize), Box<dyn Error>> {
let total_bytes = File::open(filename)?.metadata()?.len();
let mut progress = ProgressScope::new("FASTQ", total_bytes);
let reader = open_fastq_reader(filename, progress.handle())?;
let mut reader = FastqReader::new(reader);
let mut names = HashSet::new();
let mut records = 0usize;
while let Some(record) = reader.read_record()? {
let name = normalize_read_name(record.name(), keep_pair_suffixes);
if name.is_empty() {
return Err(io::Error::new(io::ErrorKind::InvalidData, "empty FASTQ read name").into());
}
names.insert(name);
records += 1;
if records % STATUS_RECORD_UPDATE_INTERVAL == 0 {
progress.set_records(records);
}
}
progress.finish(records);
Ok((names, records))
}
fn check_bam_fastq(config: &Config) -> Result<CheckResult, Box<dyn Error>> {
let (fastq_names, fastq_records) =
read_fastq_names(&config.filename_fastq, config.keep_pair_suffixes)?;
let options = BamReaderOptions {
read_name: true,
read_cigar: false,
read_sequence: false,
read_auxiliary: false,
read_qual: false,
};
let total_bytes = File::open(&config.filename_bam)?.metadata()?.len();
let mut progress = ProgressScope::new("BAM", total_bytes);
let file = File::open(&config.filename_bam)?;
let reader = BufReader::new(CountingReader::new(file, progress.handle()));
let mut bam_reader = BamReader::new(reader, Some(options))?;
let mut bam_records = 0usize;
let mut missing_records = 0usize;
let mut missing_names = HashSet::new();
let mut missing_examples = Vec::new();
for result in bam_reader.read_single_end() {
let block = result?.block;
let name = normalize_read_name(&block.read_name, config.keep_pair_suffixes);
bam_records += 1;
if bam_records % STATUS_RECORD_UPDATE_INTERVAL == 0 {
progress.set_records(bam_records);
}
if !fastq_names.contains(&name) {
missing_records += 1;
if missing_names.insert(name.clone()) && missing_examples.len() < config.report_limit {
missing_examples.push(name);
}
}
}
progress.finish(bam_records);
Ok(CheckResult {
fastq_records,
fastq_unique_names: fastq_names.len(),
bam_records,
missing_records,
missing_unique: missing_names.len(),
missing_examples,
})
}
fn main() {
let matches = Command::new("bam-check-fastq")
.version("1.0")
.author("Philipp Benner [https://github.com/pbenner]")
.about("Checks whether all BAM read names are present in a FASTQ file")
.arg(
Arg::new("bam")
.help("The input BAM file")
.required(true)
.index(1),
)
.arg(
Arg::new("fastq")
.help("The FASTQ file to compare against (plain text or gzip compressed)")
.required(true)
.index(2),
)
.arg(
Arg::new("keep-pair-suffixes")
.long("keep-pair-suffixes")
.action(clap::ArgAction::SetTrue)
.help("Do not strip trailing /1 or /2 suffixes before comparing read names"),
)
.arg(
Arg::new("report-limit")
.short('n')
.long("report-limit")
.value_parser(clap::value_parser!(usize))
.default_value("10")
.help("Maximum number of missing read names to print"),
)
.get_matches();
let config = Config {
filename_bam: matches.get_one::<String>("bam").unwrap().clone(),
filename_fastq: matches.get_one::<String>("fastq").unwrap().clone(),
keep_pair_suffixes: matches.get_flag("keep-pair-suffixes"),
report_limit: *matches.get_one::<usize>("report-limit").unwrap(),
};
match check_bam_fastq(&config) {
Ok(result) => {
if result.missing_unique == 0 {
println!(
"All {} BAM records matched read names in {} FASTQ records ({} unique names).",
result.bam_records, result.fastq_records, result.fastq_unique_names
);
} else {
eprintln!(
"Found {} missing BAM records covering {} unique read names not present in {} FASTQ records ({} unique names).",
result.missing_records,
result.missing_unique,
result.fastq_records,
result.fastq_unique_names
);
for name in result.missing_examples {
eprintln!("missing read: {}", name);
}
process::exit(1);
}
}
Err(e) => {
eprintln!("Error: {}", e);
process::exit(1);
}
}
}
#[cfg(test)]
mod tests {
use super::normalize_read_name;
#[test]
fn normalize_fastq_headers() {
assert_eq!(normalize_read_name("@read/1 comment", false), "read");
assert_eq!(normalize_read_name("@read/2", false), "read");
assert_eq!(normalize_read_name("read", false), "read");
}
#[test]
fn keep_pair_suffixes_when_requested() {
assert_eq!(normalize_read_name("@read/1 comment", true), "read/1");
}
}