use std::fs::File;
use std::io::{BufRead, BufReader, Read};
use std::path::Path;
use anyhow::{anyhow, Context, Result};
use flate2::read::MultiGzDecoder;
use crate::record::Record;
/// A pull-style source of sequence [`Record`]s.
///
/// [`open_reader`] and [`open_input`] return implementations of this trait
/// boxed as `Box<dyn RecordReader>`, so callers can consume records without
/// knowing which on-disk format is being read.
pub trait RecordReader {
/// Reads the next record from the underlying input.
///
/// The implementations in this file return `Ok(Some(record))` for each
/// record, `Ok(None)` once the input is exhausted, and `Err` when the
/// underlying parser reports a failure.
fn next_record(&mut self) -> Result<Option<Record>>;
}
/// Sequence file formats that [`detect_format`] can report.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
/// FASTQ; detected from `.fastq` / `.fq` file names.
Fastq,
/// FASTA; detected from `.fasta` / `.fa` / `.fna` file names.
Fasta,
/// Alignment formats. `detect_format` returns this for `.bam`, `.sam` and
/// `.cram` alike; `open_reader` currently rejects it as not implemented.
Bam,
}
/// Opens `path` with the reader that matches its detected format.
///
/// The format is derived from the file name via [`detect_format`]; the file
/// itself is then opened by the matching reader type.
///
/// # Errors
///
/// Fails when the format cannot be detected, when the file cannot be opened,
/// or when the detected format is [`Format::Bam`], which has no reader yet.
pub fn open_reader<P: AsRef<Path>>(path: P) -> Result<Box<dyn RecordReader>> {
    let source = path.as_ref();
    let boxed: Box<dyn RecordReader> = match detect_format(source)? {
        Format::Fastq => Box::new(FastqReader::open(source)?),
        Format::Fasta => Box::new(FastaReader::open(source)?),
        Format::Bam => {
            return Err(anyhow!(
                "BAM reading not yet implemented (path: {}). Tracking in Milestone 1.",
                source.display()
            ));
        }
    };
    Ok(boxed)
}
/// Infers the sequence format of `path` from its file name alone.
///
/// The name is lowercased and a single trailing compression suffix (as
/// recognised by `strip_compression_ext`) is removed before the remaining
/// extension is matched:
///
/// * `.fastq`, `.fq` → [`Format::Fastq`]
/// * `.fasta`, `.fa`, `.fna` → [`Format::Fasta`]
/// * `.bam`, `.sam`, `.cram` → [`Format::Bam`]
///
/// The file is never opened; only the final path component is inspected.
///
/// # Errors
///
/// Fails when the path has no final component, when that component is not
/// valid UTF-8, or when the extension is not one of those listed above.
pub fn detect_format(path: &Path) -> Result<Format> {
    const FASTQ_EXTS: [&str; 2] = [".fastq", ".fq"];
    const FASTA_EXTS: [&str; 3] = [".fasta", ".fa", ".fna"];
    const ALIGNMENT_EXTS: [&str; 3] = [".bam", ".sam", ".cram"];

    let name = path
        .file_name()
        .and_then(|s| s.to_str())
        .ok_or_else(|| anyhow!("Invalid file path: {}", path.display()))?
        .to_lowercase();
    let stem = strip_compression_ext(&name);
    let has_any = |exts: &[&str]| exts.iter().any(|ext| stem.ends_with(*ext));

    if has_any(&FASTQ_EXTS) {
        Ok(Format::Fastq)
    } else if has_any(&FASTA_EXTS) {
        Ok(Format::Fasta)
    } else if has_any(&ALIGNMENT_EXTS) {
        Ok(Format::Bam)
    } else {
        // Keep this list in step with the extension tables above; `.fna` is
        // accepted and therefore has to be advertised here as well.
        Err(anyhow!(
            "Could not detect format from filename: {}. \
             Supported extensions: .fastq[.gz], .fq[.gz], .fasta[.gz], .fa[.gz], .fna[.gz], \
             .bam, .sam, .cram",
            path.display()
        ))
    }
}
/// Removes one trailing compression suffix (`.gz`, `.bz2`, `.xz` or `.zst`)
/// from `name`.
///
/// At most one suffix is removed, and the match is case-sensitive. A name
/// without any of these suffixes is returned unchanged.
fn strip_compression_ext(name: &str) -> &str {
    [".gz", ".bz2", ".xz", ".zst"]
        .iter()
        .find_map(|ext| name.strip_suffix(*ext))
        .unwrap_or(name)
}
fn decompressed(reader: impl Read + 'static) -> Result<Box<dyn BufRead>> {
let mut buf = BufReader::new(reader);
let is_gzip = {
let head = buf.fill_buf().context("reading input header")?;
head.len() >= 2 && head[0] == 0x1f && head[1] == 0x8b
};
if is_gzip {
Ok(Box::new(BufReader::new(MultiGzDecoder::new(buf))))
} else {
Ok(Box::new(buf))
}
}
/// [`RecordReader`] implementation for FASTQ input.
///
/// Construct it with `FastqReader::open` for a file path or
/// `FastqReader::from_reader` for any `Read` source.
pub struct FastqReader {
// noodles FASTQ parser over the (possibly gzip-decoded) input stream.
inner: noodles::fastq::io::Reader<Box<dyn BufRead>>,
}
impl FastqReader {
pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
let path = path.as_ref();
let file =
File::open(path).with_context(|| format!("Failed to open file: {}", path.display()))?;
Self::from_reader(file)
}
pub fn from_reader<R: Read + 'static>(reader: R) -> Result<Self> {
let inner = decompressed(reader)?;
Ok(Self {
inner: noodles::fastq::io::Reader::new(inner),
})
}
}
impl RecordReader for FastqReader {
fn next_record(&mut self) -> Result<Option<Record>> {
let mut rec = noodles::fastq::Record::default();
let n = self
.inner
.read_record(&mut rec)
.map_err(|e| anyhow!("FASTQ parse error: {}", e))?;
if n == 0 {
return Ok(None);
}
let id = rec.name().to_vec();
let seq = rec.sequence().to_vec();
let qual = Some(rec.quality_scores().to_vec());
let desc = {
let d = rec.description();
if d.is_empty() {
None
} else {
Some(d.to_vec())
}
};
Ok(Some(Record::with_desc(id, seq, qual, desc)))
}
}
/// [`RecordReader`] implementation for FASTA input.
///
/// Construct it with `FastaReader::open` for a file path or
/// `FastaReader::from_reader` for any `Read` source.
pub struct FastaReader {
// noodles FASTA parser over the (possibly gzip-decoded) input stream.
inner: noodles::fasta::io::Reader<Box<dyn BufRead>>,
}
impl FastaReader {
pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
let path = path.as_ref();
let file =
File::open(path).with_context(|| format!("Failed to open file: {}", path.display()))?;
Self::from_reader(file)
}
pub fn from_reader<R: Read + 'static>(reader: R) -> Result<Self> {
let inner = decompressed(reader)?;
Ok(Self {
inner: noodles::fasta::io::Reader::new(inner),
})
}
}
impl RecordReader for FastaReader {
fn next_record(&mut self) -> Result<Option<Record>> {
let mut definition = String::new();
let n = self
.inner
.read_definition(&mut definition)
.map_err(|e| anyhow!("FASTA parse error: {}", e))?;
if n == 0 {
return Ok(None);
}
let mut seq = Vec::new();
self.inner
.read_sequence(&mut seq)
.map_err(|e| anyhow!("FASTA parse error: {}", e))?;
let body = definition.trim_end();
let body = body.strip_prefix('>').unwrap_or(body);
let (id, desc) = match body.split_once(char::is_whitespace) {
Some((name, rest)) => (name.as_bytes().to_vec(), Some(rest.as_bytes().to_vec())),
None => (body.as_bytes().to_vec(), None),
};
Ok(Some(Record::with_desc(id, seq, None, desc)))
}
}
pub fn open_input<P: AsRef<Path>>(path: P) -> Result<Box<dyn RecordReader>> {
let path = path.as_ref();
if path == Path::new("-") {
let reader = FastqReader::from_reader(std::io::stdin())?;
Ok(Box::new(reader))
} else {
open_reader(path)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every file name in `names` is detected as `expected`.
    fn assert_all_detect_as(names: &[&str], expected: Format) {
        for &name in names {
            assert_eq!(
                detect_format(Path::new(name)).unwrap(),
                expected,
                "{}",
                name
            );
        }
    }

    #[test]
    fn test_detect_format_fastq() {
        // Plain and compressed FASTQ names.
        assert_all_detect_as(
            &[
                "foo.fastq",
                "foo.fq",
                "foo.fastq.gz",
                "foo.fq.gz",
                "foo.fastq.bz2",
            ],
            Format::Fastq,
        );
    }

    #[test]
    fn test_detect_format_fasta() {
        // Plain and compressed FASTA names.
        assert_all_detect_as(
            &["foo.fasta", "foo.fa", "foo.fna", "foo.fa.gz"],
            Format::Fasta,
        );
    }

    #[test]
    fn test_detect_format_bam() {
        // `.sam` is reported as `Format::Bam` too.
        assert_all_detect_as(&["foo.bam", "foo.sam"], Format::Bam);
    }

    #[test]
    fn test_detect_format_unknown() {
        // Unknown or missing extensions must be rejected.
        for &name in &["foo.txt", "foo"] {
            assert!(detect_format(Path::new(name)).is_err(), "{}", name);
        }
    }
}