use std::fs::File;
use std::io::{self, BufReader, Seek};
use std::path::Path;
use noodles::bam;
use noodles::bgzf;
use noodles::sam;
use noodles::sam::alignment::record::cigar::op::Kind as CigarKind;
use noodles::sam::alignment::record::Sequence as _;
use super::{Sequence, SequenceFile};
use crate::utils::dna::reverse_complement;
enum InnerReader {
Bam(bam::io::Reader<bgzf::Reader<File>>),
Sam(sam::io::Reader<BufReader<File>>),
}
pub struct BAMFile {
reader: InnerReader,
name: String,
file_size: u64,
only_mapped: bool,
position_handle: Option<File>,
next_sequence: Option<Sequence>,
is_bam: bool,
record_size: u64,
bam_record: bam::Record,
sam_record: sam::Record,
}
impl BAMFile {
pub fn open(path: &Path, is_bam: bool, only_mapped: bool) -> io::Result<Self> {
let name = path
.file_name()
.map(|n| n.to_string_lossy().into_owned())
.unwrap_or_else(|| path.to_string_lossy().into_owned());
let file_size = std::fs::metadata(path)?.len();
let file = File::open(path)?;
let position_handle = file.try_clone().ok();
let reader = if is_bam {
let mut r = bam::io::Reader::new(file);
let _header = r.read_header()?;
InnerReader::Bam(r)
} else {
let buf = BufReader::new(file);
let mut r = sam::io::Reader::new(buf);
let _header = r.read_header()?;
InnerReader::Sam(r)
};
let mut bam_file = BAMFile {
reader,
name,
file_size,
only_mapped,
position_handle,
next_sequence: None,
is_bam,
record_size: 0,
bam_record: bam::Record::default(),
sam_record: sam::Record::default(),
};
bam_file.read_next()?;
Ok(bam_file)
}
fn read_next(&mut self) -> io::Result<()> {
loop {
let record_data = match &mut self.reader {
InnerReader::Bam(reader) => {
match reader.read_record(&mut self.bam_record) {
Ok(0) => None, Ok(_) => {
let rec = &self.bam_record;
let flags = rec.flags();
if self.only_mapped && flags.is_unmapped() {
continue;
}
let name = rec
.name()
.map(|n| String::from_utf8_lossy(n.as_ref()).into_owned())
.unwrap_or_else(|| "*".to_string());
let sequence_bases: Vec<u8> = rec.sequence().iter().collect();
let seq_len = sequence_bases.len();
let quality_bytes: Vec<u8> = rec
.quality_scores()
.as_ref()
.iter()
.map(|&q| q + 33) .collect();
let cigar_ops: Vec<(CigarKind, usize)> = rec
.cigar()
.iter()
.filter_map(|r| r.ok())
.map(|op| (op.kind(), op.len()))
.collect();
let is_reverse = flags.is_reverse_complemented();
let is_qc_fail = flags.is_qc_fail();
if self.record_size == 0 && seq_len > 0 {
let mut rs = (seq_len as u64 * 2) + 150;
if self.is_bam {
rs /= 4; }
self.record_size = rs;
}
Some(RecordData {
name,
sequence: sequence_bases,
quality: quality_bytes,
cigar_ops,
is_reverse,
is_qc_fail,
})
}
Err(e) => return Err(e),
}
}
InnerReader::Sam(reader) => {
match reader.read_record(&mut self.sam_record) {
Ok(0) => None, Ok(_) => {
let rec = &self.sam_record;
let flags = rec
.flags()
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
if self.only_mapped && flags.is_unmapped() {
continue;
}
let name = rec
.name()
.map(|n| String::from_utf8_lossy(n.as_ref()).into_owned())
.unwrap_or_else(|| "*".to_string());
let sequence_bases: Vec<u8> = rec.sequence().iter().collect();
let seq_len = sequence_bases.len();
let quality_bytes: Vec<u8> = rec.quality_scores().as_ref().to_vec();
let cigar_ops: Vec<(CigarKind, usize)> = rec
.cigar()
.iter()
.filter_map(|r| r.ok())
.map(|op| (op.kind(), op.len()))
.collect();
let is_reverse = flags.is_reverse_complemented();
let is_qc_fail = flags.is_qc_fail();
if self.record_size == 0 && seq_len > 0 {
let mut rs = (seq_len as u64 * 2) + 150;
if self.is_bam {
rs /= 4;
}
self.record_size = rs;
}
Some(RecordData {
name,
sequence: sequence_bases,
quality: quality_bytes,
cigar_ops,
is_reverse,
is_qc_fail,
})
}
Err(e) => return Err(e),
}
}
};
match record_data {
None => {
self.next_sequence = None;
return Ok(());
}
Some(data) => {
let mut sequence = data.sequence;
let mut quality = data.quality;
if self.only_mapped && !data.cigar_ops.is_empty() {
if let Some(&(kind, len)) = data.cigar_ops.last() {
if kind == CigarKind::SoftClip {
let new_len = sequence.len().saturating_sub(len);
sequence.truncate(new_len);
quality.truncate(new_len);
}
}
if let Some(&(kind, len)) = data.cigar_ops.first() {
if kind == CigarKind::SoftClip {
if len <= sequence.len() {
sequence = sequence[len..].to_vec();
quality = quality[len..].to_vec();
}
}
}
}
if data.is_reverse {
sequence = reverse_complement(&sequence);
quality.reverse();
}
let mut seq = Sequence::new(data.name, sequence, quality);
if data.is_qc_fail {
seq.is_filtered = true;
}
self.next_sequence = Some(seq);
return Ok(());
}
}
}
}
}
struct RecordData {
name: String,
sequence: Vec<u8>,
quality: Vec<u8>,
cigar_ops: Vec<(CigarKind, usize)>,
is_reverse: bool,
is_qc_fail: bool,
}
impl SequenceFile for BAMFile {
fn next(&mut self) -> Option<io::Result<Sequence>> {
let current = self.next_sequence.take()?;
if let Err(e) = self.read_next() {
return Some(Err(e));
}
Some(Ok(current))
}
fn name(&self) -> &str {
&self.name
}
fn is_colorspace(&self) -> bool {
false
}
fn percent_complete(&self) -> f64 {
if self.next_sequence.is_none() {
return 100.0;
}
if let Some(ref handle) = self.position_handle {
if let Ok(mut h) = handle.try_clone() {
if let Ok(pos) = h.stream_position() {
return (pos as f64 / self.file_size as f64) * 100.0;
}
}
}
0.0
}
}
pub fn open_sequence_file(
config: &crate::config::FastQCConfig,
path: &Path,
) -> io::Result<Box<dyn SequenceFile>> {
use super::fastq::FastQFile;
if let Some(ref format) = config.sequence_format {
let format_lower = format.to_lowercase();
match format_lower.as_str() {
"bam" => {
return Ok(Box::new(BAMFile::open(path, true, false)?));
}
"sam" => {
return Ok(Box::new(BAMFile::open(path, false, false)?));
}
"bam_mapped" => {
return Ok(Box::new(BAMFile::open(path, true, true)?));
}
"sam_mapped" => {
return Ok(Box::new(BAMFile::open(path, false, true)?));
}
"fastq" => {
return Ok(Box::new(FastQFile::open(config, path)?));
}
other => {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
format!("Unknown sequence format: {}", other),
));
}
}
}
let name_lower = path
.file_name()
.map(|n| n.to_string_lossy().to_lowercase())
.unwrap_or_default();
if name_lower.ends_with(".bam") || name_lower.ends_with(".ubam") {
Ok(Box::new(BAMFile::open(path, true, false)?))
} else if name_lower.ends_with(".sam") {
Ok(Box::new(BAMFile::open(path, false, false)?))
} else if name_lower.ends_with(".fast5") {
Ok(Box::new(super::fast5::Fast5File::open(path)?))
} else {
Ok(Box::new(FastQFile::open(config, path)?))
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_format_detection_fast5() {
let config = crate::config::FastQCConfig::default();
let result = open_sequence_file(&config, Path::new("nonexistent.fast5"));
assert!(result.is_err());
}
#[test]
fn test_format_detection_unknown_format() {
let config = crate::config::FastQCConfig {
sequence_format: Some("unknown_format".to_string()),
..Default::default()
};
let result = open_sequence_file(&config, Path::new("test.dat"));
assert!(result.is_err());
}
}