use crate::core::{GenomicReader, GenomicRecordIterator};
use crate::error::{Error, Result};
use flate2::{Decompress, FlushDecompress};
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Read};
use std::path::Path;
const MAX_BLOCK_SIZE: usize = 65536;
const BAM_MAGIC: [u8; 4] = [b'B', b'A', b'M', 1];
const SEQ_LOOKUP: [u8; 16] = [
b'=', b'A', b'C', b'M', b'G', b'R', b'S', b'V', b'T', b'W', b'Y', b'H', b'K', b'D', b'B', b'N',
];
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CigarOp {
Match(u32),
Ins(u32),
Del(u32),
RefSkip(u32),
SoftClip(u32),
HardClip(u32),
Pad(u32),
Equal(u32),
Diff(u32),
}
impl CigarOp {
pub fn from_encoded(encoded: u32) -> Result<Self> {
let len = encoded >> 4;
let op = (encoded & 0xF) as u8;
match op {
0 => Ok(CigarOp::Match(len)),
1 => Ok(CigarOp::Ins(len)),
2 => Ok(CigarOp::Del(len)),
3 => Ok(CigarOp::RefSkip(len)),
4 => Ok(CigarOp::SoftClip(len)),
5 => Ok(CigarOp::HardClip(len)),
6 => Ok(CigarOp::Pad(len)),
7 => Ok(CigarOp::Equal(len)),
8 => Ok(CigarOp::Diff(len)),
_ => Err(Error::InvalidInput(format!(
"Invalid CIGAR operation: {}",
op
))),
}
}
pub fn len(&self) -> u32 {
match self {
CigarOp::Match(l)
| CigarOp::Ins(l)
| CigarOp::Del(l)
| CigarOp::RefSkip(l)
| CigarOp::SoftClip(l)
| CigarOp::HardClip(l)
| CigarOp::Pad(l)
| CigarOp::Equal(l)
| CigarOp::Diff(l) => *l,
}
}
pub fn consumes_ref(&self) -> bool {
matches!(
self,
CigarOp::Match(_)
| CigarOp::Del(_)
| CigarOp::RefSkip(_)
| CigarOp::Equal(_)
| CigarOp::Diff(_)
)
}
pub fn consumes_query(&self) -> bool {
matches!(
self,
CigarOp::Match(_)
| CigarOp::Ins(_)
| CigarOp::SoftClip(_)
| CigarOp::Equal(_)
| CigarOp::Diff(_)
)
}
pub fn to_char(&self) -> char {
match self {
CigarOp::Match(_) => 'M',
CigarOp::Ins(_) => 'I',
CigarOp::Del(_) => 'D',
CigarOp::RefSkip(_) => 'N',
CigarOp::SoftClip(_) => 'S',
CigarOp::HardClip(_) => 'H',
CigarOp::Pad(_) => 'P',
CigarOp::Equal(_) => '=',
CigarOp::Diff(_) => 'X',
}
}
pub fn parse_cigar_string(cigar_str: &str) -> Result<Vec<Self>> {
if cigar_str == "*" {
return Ok(Vec::new());
}
let mut operations = Vec::new();
let mut num_str = String::new();
for ch in cigar_str.chars() {
if ch.is_ascii_digit() {
num_str.push(ch);
} else {
if num_str.is_empty() {
return Err(Error::Parse(format!(
"Invalid CIGAR string: operation '{}' without length",
ch
)));
}
let len = num_str
.parse::<u32>()
.map_err(|_| Error::Parse(format!("Invalid CIGAR length: {}", num_str)))?;
let op = match ch {
'M' => CigarOp::Match(len),
'I' => CigarOp::Ins(len),
'D' => CigarOp::Del(len),
'N' => CigarOp::RefSkip(len),
'S' => CigarOp::SoftClip(len),
'H' => CigarOp::HardClip(len),
'P' => CigarOp::Pad(len),
'=' => CigarOp::Equal(len),
'X' => CigarOp::Diff(len),
_ => return Err(Error::Parse(format!("Invalid CIGAR operation: {}", ch))),
};
operations.push(op);
num_str.clear();
}
}
if !num_str.is_empty() {
return Err(Error::Parse(format!(
"Invalid CIGAR string: trailing digits '{}'",
num_str
)));
}
Ok(operations)
}
}
impl std::fmt::Display for CigarOp {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}{}", self.len(), self.to_char())
}
}
#[derive(Debug, Clone)]
pub struct RefSeq {
pub name: String,
pub length: u32,
}
#[derive(Debug, Clone)]
pub struct BamHeader {
pub text: String,
pub references: Vec<RefSeq>,
}
impl BamHeader {
pub fn ref_name(&self, idx: i32) -> Option<&str> {
if idx < 0 {
return None;
}
self.references.get(idx as usize).map(|r| r.name.as_str())
}
}
#[derive(Debug, Clone)]
pub struct BamRecord {
pub qname: String,
pub flag: u16,
pub refid: i32,
pub rname: String,
pub pos: i32,
pub mapq: u8,
pub cigar: Vec<CigarOp>,
pub next_refid: i32,
pub next_pos: i32,
pub tlen: i32,
pub seq: Vec<u8>,
pub qual: Vec<u8>,
pub tags: HashMap<String, TagValue>,
}
impl BamRecord {
pub fn is_paired(&self) -> bool {
(self.flag & 0x1) != 0
}
pub fn is_proper_pair(&self) -> bool {
(self.flag & 0x2) != 0
}
pub fn is_unmapped(&self) -> bool {
(self.flag & 0x4) != 0
}
pub fn is_mate_unmapped(&self) -> bool {
(self.flag & 0x8) != 0
}
pub fn is_reverse(&self) -> bool {
(self.flag & 0x10) != 0
}
pub fn is_mate_reverse(&self) -> bool {
(self.flag & 0x20) != 0
}
pub fn is_first_in_pair(&self) -> bool {
(self.flag & 0x40) != 0
}
pub fn is_second_in_pair(&self) -> bool {
(self.flag & 0x80) != 0
}
pub fn is_secondary(&self) -> bool {
(self.flag & 0x100) != 0
}
pub fn is_qc_fail(&self) -> bool {
(self.flag & 0x200) != 0
}
pub fn is_duplicate(&self) -> bool {
(self.flag & 0x400) != 0
}
pub fn is_supplementary(&self) -> bool {
(self.flag & 0x800) != 0
}
pub fn cigar_string(&self) -> String {
self.cigar.iter().map(|op| op.to_string()).collect()
}
pub fn seq_string(&self) -> String {
String::from_utf8_lossy(&self.seq).to_string()
}
}
#[derive(Debug, Clone)]
pub enum TagValue {
Char(u8),
Int(i32),
Float(f32),
String(String),
ByteArray(Vec<u8>),
}
pub struct BamReader<R: Read> {
inner: BufReader<R>,
current_block: Vec<u8>,
block_pos: usize,
eof: bool,
header: Option<BamHeader>,
decompressor: Decompress,
compressed_buf: Vec<u8>,
blocks_decompressed: usize,
file_path: Option<std::path::PathBuf>,
}
impl BamReader<File> {
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
let path_buf = path.as_ref().to_path_buf();
let file = File::open(&path_buf)?;
let mut reader = Self::new(file);
reader.file_path = Some(path_buf);
Ok(reader)
}
pub fn compute_stats(
self,
config: Option<crate::parallel::ParallelConfig>,
) -> Result<super::stats::BamStats> {
super::stats::BamStats::par_compute(self, config)
}
}
impl<R: Read> BamReader<R> {
pub fn new(reader: R) -> Self {
Self {
inner: BufReader::new(reader),
current_block: Vec::new(),
block_pos: 0,
eof: false,
header: None,
decompressor: Decompress::new(false), compressed_buf: Vec::with_capacity(MAX_BLOCK_SIZE),
blocks_decompressed: 0,
file_path: None,
}
}
pub fn read_header(&mut self) -> Result<&BamHeader> {
if self.header.is_some() {
return Ok(self.header.as_ref().unwrap());
}
let mut magic = [0u8; 4];
self.read_exact(&mut magic)?;
if magic != BAM_MAGIC {
return Err(Error::InvalidInput(format!(
"Invalid BAM magic number: expected {:?}, got {:?}",
BAM_MAGIC, magic
)));
}
let mut l_text_buf = [0u8; 4];
self.read_exact(&mut l_text_buf)?;
let l_text = u32::from_le_bytes(l_text_buf) as usize;
let mut text_buf = vec![0u8; l_text];
self.read_exact(&mut text_buf)?;
let text = String::from_utf8_lossy(&text_buf).to_string();
let mut n_ref_buf = [0u8; 4];
self.read_exact(&mut n_ref_buf)?;
let n_ref = u32::from_le_bytes(n_ref_buf) as usize;
let mut references = Vec::with_capacity(n_ref);
for _ in 0..n_ref {
let mut l_name_buf = [0u8; 4];
self.read_exact(&mut l_name_buf)?;
let l_name = u32::from_le_bytes(l_name_buf) as usize;
let mut name_buf = vec![0u8; l_name];
self.read_exact(&mut name_buf)?;
if name_buf.last() == Some(&0) {
name_buf.pop();
}
let name = String::from_utf8_lossy(&name_buf).to_string();
let mut l_ref_buf = [0u8; 4];
self.read_exact(&mut l_ref_buf)?;
let length = u32::from_le_bytes(l_ref_buf);
references.push(RefSeq { name, length });
}
self.header = Some(BamHeader { text, references });
Ok(self.header.as_ref().unwrap())
}
pub fn header(&self) -> Option<&BamHeader> {
self.header.as_ref()
}
pub fn set_header(&mut self, header: BamHeader) {
self.header = Some(header);
}
pub fn blocks_decompressed(&self) -> usize {
self.blocks_decompressed
}
pub fn file_path(&self) -> Option<&std::path::Path> {
self.file_path.as_deref()
}
pub fn parse_record(&self, data: &[u8]) -> Result<BamRecord> {
let mut pos = 0;
macro_rules! read_bytes {
($n:expr) => {{
if pos + $n > data.len() {
return Err(Error::InvalidInput(
"Unexpected end of BAM record".to_string(),
));
}
let bytes = &data[pos..pos + $n];
pos += $n;
bytes
}};
}
let _block_size = u32::from_le_bytes(read_bytes!(4).try_into().unwrap());
let refid = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());
let rname = if let Some(header) = &self.header {
header
.ref_name(refid)
.unwrap_or("*") .to_string()
} else {
if refid >= 0 {
format!("refid_{}", refid)
} else {
"*".to_string()
}
};
let record_pos = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());
let l_read_name = read_bytes!(1)[0];
let mapq = read_bytes!(1)[0];
let _bin = u16::from_le_bytes(read_bytes!(2).try_into().unwrap()); let n_cigar_op = u16::from_le_bytes(read_bytes!(2).try_into().unwrap());
let flag = u16::from_le_bytes(read_bytes!(2).try_into().unwrap());
let l_seq = u32::from_le_bytes(read_bytes!(4).try_into().unwrap());
let next_refid = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());
let next_pos = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());
let tlen = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());
let qname_bytes = read_bytes!(l_read_name as usize);
let qname_len = if qname_bytes.last() == Some(&0) {
qname_bytes.len() - 1
} else {
qname_bytes.len()
};
let qname = String::from_utf8_lossy(&qname_bytes[..qname_len]).to_string();
let mut cigar = Vec::with_capacity(n_cigar_op as usize);
for _ in 0..n_cigar_op {
let encoded = u32::from_le_bytes(read_bytes!(4).try_into().unwrap());
cigar.push(CigarOp::from_encoded(encoded)?);
}
let seq_bytes_len = ((l_seq + 1) / 2) as usize;
let seq_buf = read_bytes!(seq_bytes_len);
let mut seq = Vec::with_capacity(l_seq as usize);
for byte in seq_buf {
seq.push(SEQ_LOOKUP[(byte >> 4) as usize]);
seq.push(SEQ_LOOKUP[(byte & 0x0F) as usize]);
}
seq.truncate(l_seq as usize);
let qual_bytes = read_bytes!(l_seq as usize);
let qual = if l_seq > 0 && qual_bytes[0] == 0xFF && qual_bytes.iter().all(|&q| q == 0xFF) {
Vec::new()
} else {
qual_bytes.to_vec()
};
let mut tags = HashMap::new();
while pos < data.len() {
if pos + 3 > data.len() {
break;
}
let tag_bytes = read_bytes!(2);
let tag = String::from_utf8_lossy(tag_bytes).to_string();
let val_type = read_bytes!(1)[0];
let value = match val_type {
b'A' => TagValue::Char(read_bytes!(1)[0]),
b'c' => TagValue::Int(i8::from_le_bytes(read_bytes!(1).try_into().unwrap()) as i32),
b'C' => TagValue::Int(read_bytes!(1)[0] as i32),
b's' => {
TagValue::Int(i16::from_le_bytes(read_bytes!(2).try_into().unwrap()) as i32)
}
b'S' => {
TagValue::Int(u16::from_le_bytes(read_bytes!(2).try_into().unwrap()) as i32)
}
b'i' => TagValue::Int(i32::from_le_bytes(read_bytes!(4).try_into().unwrap())),
b'I' => {
TagValue::Int(u32::from_le_bytes(read_bytes!(4).try_into().unwrap()) as i32)
}
b'f' => TagValue::Float(f32::from_le_bytes(read_bytes!(4).try_into().unwrap())),
b'Z' => {
let start = pos;
while pos < data.len() && data[pos] != 0 {
pos += 1;
}
let s = &data[start..pos];
if pos < data.len() {
pos += 1; }
TagValue::String(String::from_utf8_lossy(s).to_string())
}
b'H' => {
let start = pos;
while pos < data.len() && data[pos] != 0 {
pos += 1;
}
let s = &data[start..pos];
if pos < data.len() {
pos += 1; }
TagValue::String(String::from_utf8_lossy(s).to_string())
}
b'B' => {
let array_type = read_bytes!(1)[0];
let count = u32::from_le_bytes(read_bytes!(4).try_into().unwrap()) as usize;
let mut arr = Vec::new();
for _ in 0..count {
match array_type {
b'c' => arr.push(read_bytes!(1)[0]),
b'C' => arr.push(read_bytes!(1)[0]),
b's' => arr.extend_from_slice(read_bytes!(2)),
b'S' => arr.extend_from_slice(read_bytes!(2)),
b'i' => arr.extend_from_slice(read_bytes!(4)),
b'I' => arr.extend_from_slice(read_bytes!(4)),
b'f' => arr.extend_from_slice(read_bytes!(4)),
_ => {
return Err(Error::InvalidInput(format!(
"Unknown array type: {}",
array_type
)))
}
}
}
TagValue::ByteArray(arr)
}
_ => {
return Err(Error::InvalidInput(format!(
"Unknown tag type: {}",
val_type as char
)))
}
};
tags.insert(tag, value);
}
Ok(BamRecord {
qname,
flag,
refid,
rname,
pos: record_pos,
mapq,
cigar,
next_refid,
next_pos,
tlen,
seq,
qual,
tags,
})
}
pub fn records(&mut self) -> BamRecordIterator<'_, R> {
BamRecordIterator { reader: self }
}
fn read_exact(&mut self, buf: &mut [u8]) -> std::io::Result<()> {
let mut total_read = 0;
while total_read < buf.len() {
let n = self.read(&mut buf[total_read..])?;
if n == 0 {
return Err(std::io::Error::new(
std::io::ErrorKind::UnexpectedEof,
"unexpected end of file",
));
}
total_read += n;
}
Ok(())
}
fn read_next_block(&mut self) -> Result<bool> {
if self.eof {
return Ok(false);
}
let mut header = [0u8; 10];
match self.inner.read_exact(&mut header) {
Ok(_) => {}
Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
self.eof = true;
return Ok(false);
}
Err(e) => return Err(Error::Io(e)),
}
if header[0] != 31 || header[1] != 139 {
return Err(Error::InvalidInput(
"Invalid bam block: bad gzip magic number".to_string(),
));
}
let flags = header[3];
let has_extra = (flags & 0x04) != 0;
if !has_extra {
return Err(Error::InvalidInput(
"Invalid bam block: missing extra field".to_string(),
));
}
let mut xlen_buf = [0u8; 2];
self.inner.read_exact(&mut xlen_buf)?;
let xlen = u16::from_le_bytes(xlen_buf) as usize;
let mut extra = vec![0u8; xlen];
self.inner.read_exact(&mut extra)?;
let bsize = self.parse_bam_extra(&extra)?;
let compressed_size = bsize + 1 - 10 - 2 - xlen - 8;
self.compressed_buf.clear();
self.compressed_buf.resize(compressed_size, 0);
self.inner.read_exact(&mut self.compressed_buf)?;
let mut footer = [0u8; 8];
self.inner.read_exact(&mut footer)?;
self.decompressor.reset(false);
self.current_block.clear();
self.current_block.resize(MAX_BLOCK_SIZE, 0);
let before_out = self.decompressor.total_out();
self.decompressor
.decompress(
&self.compressed_buf,
&mut self.current_block,
FlushDecompress::Finish,
)
.map_err(|e| Error::InvalidInput(format!("Decompression failed: {}", e)))?;
let bytes_out = (self.decompressor.total_out() - before_out) as usize;
self.current_block.truncate(bytes_out);
if self.current_block.len() > MAX_BLOCK_SIZE {
return Err(Error::InvalidInput(format!(
"BGZF block too large: {} bytes (max {})",
self.current_block.len(),
MAX_BLOCK_SIZE
)));
}
self.block_pos = 0;
self.blocks_decompressed += 1;
Ok(true)
}
fn parse_bam_extra(&self, extra: &[u8]) -> Result<usize> {
let mut pos = 0;
while pos + 4 <= extra.len() {
let si1 = extra[pos];
let si2 = extra[pos + 1];
let slen = u16::from_le_bytes([extra[pos + 2], extra[pos + 3]]) as usize;
if si1 == 66 && si2 == 67 {
if pos + 4 + slen > extra.len() {
return Err(Error::InvalidInput("bam extra field truncated".to_string()));
}
if slen != 2 {
return Err(Error::InvalidInput(format!(
"Invalid bam subfield length: {}",
slen
)));
}
let bsize = u16::from_le_bytes([extra[pos + 4], extra[pos + 5]]) as usize;
return Ok(bsize);
}
pos += 4 + slen;
}
Err(Error::InvalidInput(
"bam subfield not found in extra field".to_string(),
))
}
}
impl<R: Read> Read for BamReader<R> {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
if buf.is_empty() {
return Ok(0);
}
if self.block_pos >= self.current_block.len() {
match self.read_next_block() {
Ok(true) => {}
Ok(false) => return Ok(0), Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
}
}
let available = self.current_block.len() - self.block_pos;
let to_copy = buf.len().min(available);
buf[..to_copy]
.copy_from_slice(&self.current_block[self.block_pos..self.block_pos + to_copy]);
self.block_pos += to_copy;
Ok(to_copy)
}
}
impl<R: Read> GenomicRecordIterator for BamReader<R> {
type Record = BamRecord;
fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
let mut size_buf = [0u8; 4];
match self.read_exact(&mut size_buf) {
Ok(_) => {}
Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
return Ok(None);
}
Err(e) => return Err(e.into()),
}
let block_size = u32::from_le_bytes(size_buf);
let mut record_data = vec![0u8; block_size as usize + 4]; record_data[0..4].copy_from_slice(&size_buf);
self.read_exact(&mut record_data[4..])?;
Ok(Some(record_data))
}
fn next_record(&mut self) -> Result<Option<Self::Record>> {
let record_data = self.next_raw()?;
if record_data.is_none() {
return Ok(None);
}
let record_data = record_data.unwrap();
self.parse_record(&record_data).map(Some)
}
}
impl<R: Read> GenomicReader for BamReader<R> {
type Metadata = BamHeader;
fn metadata(&self) -> &Self::Metadata {
self.header
.as_ref()
.expect("Header not read yet. Call read_header() first.")
}
}
pub struct BamRecordIterator<'a, R: Read> {
reader: &'a mut BamReader<R>,
}
impl<'a, R: Read> Iterator for BamRecordIterator<'a, R> {
type Item = Result<BamRecord>;
fn next(&mut self) -> Option<Self::Item> {
match self.reader.next_record() {
Ok(Some(record)) => Some(Ok(record)),
Ok(None) => None,
Err(e) => Some(Err(e)),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Cursor;
#[test]
fn test_cigar_op_encoding() {
let encoded = (10 << 4) | 0; let op = CigarOp::from_encoded(encoded).unwrap();
assert_eq!(op, CigarOp::Match(10));
assert_eq!(op.len(), 10);
assert!(op.consumes_ref());
assert!(op.consumes_query());
assert_eq!(op.to_char(), 'M');
}
#[test]
fn test_cigar_op_display() {
assert_eq!(CigarOp::Match(10).to_string(), "10M");
assert_eq!(CigarOp::Ins(5).to_string(), "5I");
assert_eq!(CigarOp::Del(3).to_string(), "3D");
}
#[test]
fn test_bam_reader_creation() {
let data = vec![0u8; 1024];
let cursor = Cursor::new(data);
let _reader = BamReader::new(cursor);
}
#[test]
fn test_bam_reader_invalid_magic() {
let data = vec![0u8; 100];
let cursor = Cursor::new(data);
let mut reader = BamReader::new(cursor);
let mut buf = vec![0u8; 10];
let result = reader.read(&mut buf);
assert!(result.is_err() || result.unwrap() == 0);
}
#[test]
fn test_parse_bgzf_extra_valid() {
let reader = BamReader::new(Cursor::new(vec![]));
let extra = vec![66, 67, 2, 0, 100, 0];
let result = reader.parse_bam_extra(&extra);
assert!(result.is_ok());
assert_eq!(result.unwrap(), 100);
}
#[test]
fn test_parse_bgzf_extra_invalid() {
let reader = BamReader::new(Cursor::new(vec![]));
let extra = vec![1, 2, 2, 0, 100, 0];
let result = reader.parse_bam_extra(&extra);
assert!(result.is_err());
}
#[test]
fn test_parse_bgzf_extra_truncated() {
let reader = BamReader::new(Cursor::new(vec![]));
let extra = vec![66, 67, 2, 0];
let result = reader.parse_bam_extra(&extra);
assert!(result.is_err());
}
#[test]
fn test_max_block_size() {
assert_eq!(MAX_BLOCK_SIZE, 65536);
}
#[test]
fn test_bam_reader_from_empty() {
let cursor = Cursor::new(vec![]);
let mut reader = BamReader::new(cursor);
let mut buf = vec![0u8; 10];
let result = reader.read(&mut buf);
assert!(result.is_ok());
assert_eq!(result.unwrap(), 0);
}
}