use std::io::{self, BufRead, ErrorKind};
use std::num::NonZeroUsize;
use anyhow::{Context, Result, anyhow, bail};
use fgumi_raw_bam::{BAM_MAGIC, RawRecord};
use libdeflater::Decompressor;
use noodles_sam::Header;
use noodles_sam::header::ReferenceSequences;
use noodles_sam::header::record::value::Map;
use noodles_sam::header::record::value::map::ReferenceSequence;
/// Number of bytes read as the fixed-size BGZF block header. The reader expects
/// the `BC` subfield identifier at offsets 12..14 and the little-endian `BSIZE`
/// value (total block size minus one) in the final two bytes.
const BGZF_HEADER_SIZE: usize = 18;
/// Number of bytes in the BGZF block footer: a little-endian CRC32 of the
/// uncompressed data followed by the little-endian uncompressed length (ISIZE).
const BGZF_FOOTER_SIZE: usize = 8;
/// Upper bound on the number of BGZF blocks decoded per buffer refill; also used
/// to size the initial capacity of the decompressed buffer.
const BLOCKS_PER_REFILL: usize = 64;
/// Streaming reader that decodes BGZF blocks from `R` into an in-memory buffer
/// and hands out the BAM header and raw record bytes from that buffer.
pub(crate) struct RawBamReader<R: BufRead> {
// Underlying source of BGZF-compressed bytes.
inner: R,
// Decompressor instance reused for every deflate-compressed block.
decompressor: Decompressor,
// Decompressed bytes; new blocks are appended at the end on refill.
buf: Vec<u8>,
// Offset into `buf` of the next byte that has not been consumed yet.
pos: usize,
// Scratch space holding the compressed payload of the block being inflated.
compressed_scratch: Vec<u8>,
// Set once the underlying reader has reported end of input.
eof: bool,
// When true, each block's CRC32 footer is verified against its data.
check_crc: bool,
}
impl<R: BufRead> RawBamReader<R> {
/// Builds a reader over `reader` with empty, pre-sized buffers.
///
/// `check_crc` selects whether every decoded block is checked against the
/// CRC32 stored in its footer.
pub(crate) fn new(reader: R, check_crc: bool) -> Self {
    // Room for one full refill of decompressed data, and for one compressed block.
    let decompressed = Vec::with_capacity(BLOCKS_PER_REFILL * 65_536);
    let scratch = Vec::with_capacity(65_536);
    Self {
        inner: reader,
        decompressor: Decompressor::new(),
        buf: decompressed,
        pos: 0,
        compressed_scratch: scratch,
        eof: false,
        check_crc,
    }
}
/// Reads the BAM magic, the SAM header text and the binary reference list,
/// and returns the resulting [`Header`].
///
/// When the text header carries no `@SQ` lines the binary reference list is
/// used as the header's reference sequences; otherwise the two lists must
/// agree in count, order, names and lengths.
///
/// # Errors
///
/// Fails if the stream ends early, the magic is wrong, the header text is not
/// valid UTF-8 or does not parse, a reference name is empty or not
/// NUL-terminated, a reference has zero length, or the text and binary
/// reference lists disagree.
pub(crate) fn read_header(&mut self) -> Result<Header> {
    // `n_ref` comes straight from the file; cap the up-front allocation so a
    // corrupt count cannot trigger a huge reservation. The map still grows
    // as needed for genuinely large reference lists.
    const MAX_PREALLOCATED_REFS: usize = 1 << 16;

    self.ensure_bytes(4)?;
    if &self.buf[self.pos..self.pos + 4] != BAM_MAGIC {
        bail!(
            "Not a BAM file: expected magic {:?}, got {:?}",
            BAM_MAGIC,
            &self.buf[self.pos..self.pos + 4]
        );
    }
    self.pos += 4;

    let l_text = self.read_u32()? as usize;
    self.ensure_bytes(l_text)?;
    let mut text = &self.buf[self.pos..self.pos + l_text];
    // The header text may be NUL-terminated or NUL-padded; strip trailing
    // NULs so the SAM parser only sees the actual text.
    while let [head @ .., 0] = text {
        text = head;
    }
    let mut header: Header = if text.is_empty() {
        Header::default()
    } else {
        std::str::from_utf8(text)
            .context("BAM header text is not valid UTF-8")?
            .parse()
            .context("parsing SAM header text")?
    };
    self.pos += l_text;

    let n_ref = self.read_u32()? as usize;
    let mut binary_refs = ReferenceSequences::with_capacity(n_ref.min(MAX_PREALLOCATED_REFS));
    for ref_idx in 0..n_ref {
        let l_name = self.read_u32()? as usize;
        if l_name == 0 {
            bail!("BAM reference name length is zero (spec requires NUL terminator)");
        }
        self.ensure_bytes(l_name)?;
        // `l_name` counts the trailing NUL, which is not part of the name.
        let name_end = self.pos + l_name - 1;
        if self.buf[name_end] != 0 {
            bail!("BAM reference name at index {ref_idx} is not NUL-terminated");
        }
        let name = self.buf[self.pos..name_end].to_vec();
        self.pos += l_name;
        let l_ref = self.read_u32()? as usize;
        let length = NonZeroUsize::new(l_ref)
            .ok_or_else(|| anyhow!("reference sequence with zero length"))?;
        binary_refs.insert(name.into(), Map::<ReferenceSequence>::new(length));
    }

    if header.reference_sequences().is_empty() {
        *header.reference_sequences_mut() = binary_refs;
    } else {
        let text_refs = header.reference_sequences();
        if text_refs.len() != binary_refs.len() {
            bail!(
                "BAM header @SQ count ({}) does not match binary ref list ({})",
                text_refs.len(),
                binary_refs.len(),
            );
        }
        let paired = text_refs.iter().zip(binary_refs.iter()).enumerate();
        for (idx, ((t_name, t_map), (b_name, b_map))) in paired {
            if t_name != b_name {
                bail!(
                    "BAM @SQ name mismatch at index {idx}: text={:?} binary={:?}",
                    String::from_utf8_lossy(t_name),
                    String::from_utf8_lossy(b_name),
                );
            }
            if t_map.length() != b_map.length() {
                bail!(
                    "BAM @SQ length mismatch for {}: text={} binary={}",
                    String::from_utf8_lossy(t_name),
                    usize::from(t_map.length()),
                    usize::from(b_map.length()),
                );
            }
        }
    }
    Ok(header)
}
/// Reads the next BAM record into `rec`, replacing its previous contents
/// with the record body (the bytes following the 4-byte `block_size`).
///
/// Returns `Ok(true)` when a record was read and `Ok(false)` when the stream
/// ended cleanly on a record boundary.
///
/// # Errors
///
/// Returns `ErrorKind::UnexpectedEof` when the stream ends inside the
/// `block_size` prefix or inside the record body, and propagates any error
/// raised while decoding BGZF blocks.
pub(crate) fn read_record(&mut self, rec: &mut RawRecord) -> io::Result<bool> {
    // Keep refilling until the length prefix is available: one refill is not
    // guaranteed to deliver four bytes.
    while !self.has_bytes(4) {
        if self.try_refill()? {
            continue;
        }
        let leftover = self.buf.len() - self.pos;
        if leftover == 0 {
            return Ok(false);
        }
        // Stray bytes that cannot form a length prefix mean the file was cut
        // short; report that instead of treating it as a normal end.
        return Err(io::Error::new(
            ErrorKind::UnexpectedEof,
            format!("Truncated BAM record: only {leftover} of 4 block_size bytes present"),
        ));
    }
    let prefix_end = self.pos + 4;
    let mut prefix = [0u8; 4];
    prefix.copy_from_slice(&self.buf[self.pos..prefix_end]);
    let block_size = u32::from_le_bytes(prefix) as usize;
    self.pos = prefix_end;

    self.ensure_bytes_io(block_size)?;
    let vec = rec.as_mut_vec();
    vec.clear();
    vec.extend_from_slice(&self.buf[self.pos..self.pos + block_size]);
    self.pos += block_size;
    Ok(true)
}
/// Consumes four bytes from the buffer and decodes them as a little-endian
/// `u32`, refilling first if fewer than four bytes are buffered.
fn read_u32(&mut self) -> Result<u32> {
    self.ensure_bytes(4)?;
    let end = self.pos + 4;
    let mut raw = [0u8; 4];
    raw.copy_from_slice(&self.buf[self.pos..end]);
    self.pos = end;
    Ok(u32::from_le_bytes(raw))
}
/// Reports whether at least `n` unconsumed bytes are currently buffered.
#[inline]
fn has_bytes(&self, n: usize) -> bool {
    let unread = self.buf.len() - self.pos;
    n <= unread
}
fn ensure_bytes(&mut self, n: usize) -> Result<()> {
while !self.has_bytes(n) {
if !self.try_refill().context("reading BGZF blocks")? {
return Err(anyhow!(
"Unexpected EOF: needed {n} more bytes (have {}/{})",
self.buf.len() - self.pos,
n
));
}
}
Ok(())
}
/// `io::Result` flavour of the buffer guarantee used on the record path:
/// refills until at least `n` unconsumed bytes are available.
///
/// Returns an `ErrorKind::UnexpectedEof` error when a refill yields no new
/// data before `n` bytes are buffered.
fn ensure_bytes_io(&mut self, n: usize) -> io::Result<()> {
    loop {
        if self.has_bytes(n) {
            return Ok(());
        }
        if self.try_refill()? {
            continue;
        }
        let have = self.buf.len() - self.pos;
        let message = format!(
            "Truncated BAM record: needed {n} more bytes (have {}/{})",
            have, n
        );
        return Err(io::Error::new(ErrorKind::UnexpectedEof, message));
    }
}
/// Discards consumed bytes, then decodes further BGZF blocks onto the end of
/// the buffer.
///
/// Returns `Ok(true)` if new decompressed bytes were appended and `Ok(false)`
/// once the underlying reader is exhausted. Normally at most
/// `BLOCKS_PER_REFILL` blocks are decoded per call, but decoding continues
/// past that limit while only empty blocks have been seen, so `false` is
/// never reported while input remains.
fn try_refill(&mut self) -> io::Result<bool> {
    if self.eof {
        return Ok(false);
    }
    // Slide the unread tail to the front so the buffer does not grow without bound.
    if self.pos > 0 {
        let remaining = self.buf.len() - self.pos;
        if remaining > 0 {
            self.buf.copy_within(self.pos.., 0);
        }
        self.buf.truncate(remaining);
        self.pos = 0;
    }
    let starting = self.buf.len();
    let mut blocks_read = 0usize;
    // Empty blocks contribute no bytes; keep going until real data shows up
    // or the input ends, otherwise callers would mistake them for EOF.
    while blocks_read < BLOCKS_PER_REFILL || self.buf.len() == starting {
        if !self.read_one_block()? {
            break;
        }
        blocks_read += 1;
    }
    Ok(self.buf.len() > starting)
}
/// Reads one BGZF block from the underlying reader and appends its
/// uncompressed contents to `buf`.
///
/// Returns `Ok(true)` when a block was consumed (including empty blocks) and
/// `Ok(false)` when the reader was already at end of input, in which case
/// `eof` is set.
///
/// A payload whose first deflate block is of the "stored" type is copied
/// directly into `buf`; it must consist of exactly one stored block. Any
/// other payload is inflated via the scratch buffer.
///
/// # Errors
///
/// Returns `ErrorKind::UnexpectedEof` when the input ends inside a block, and
/// `ErrorKind::InvalidData` for malformed headers, size mismatches, inflate
/// failures and (when enabled) CRC mismatches. On failure no bytes of the
/// offending block are left in `buf`.
fn read_one_block(&mut self) -> io::Result<bool> {
    // The BGZF format limits the uncompressed size of a block to 64 KiB.
    const MAX_BGZF_ISIZE: usize = 65_536;

    // Distinguish a clean end of input (no bytes at all) from a block header
    // that is only partially present, which indicates a truncated file.
    let at_eof = loop {
        match self.inner.fill_buf() {
            Ok(pending) => break pending.is_empty(),
            Err(e) if e.kind() == ErrorKind::Interrupted => {}
            Err(e) => return Err(e),
        }
    };
    if at_eof {
        self.eof = true;
        return Ok(false);
    }

    let mut header = [0u8; BGZF_HEADER_SIZE];
    self.inner.read_exact(&mut header).map_err(|e| {
        if e.kind() == ErrorKind::UnexpectedEof {
            io::Error::new(ErrorKind::UnexpectedEof, "truncated BGZF block header")
        } else {
            e
        }
    })?;
    if header[0] != 0x1f || header[1] != 0x8b {
        return Err(io::Error::new(
            ErrorKind::InvalidData,
            format!("Invalid BGZF magic 0x{:02x} 0x{:02x}", header[0], header[1]),
        ));
    }
    // The fixed 18-byte layout assumed below only holds when the extra field
    // is exactly the 6-byte BC subfield.
    let xlen = u16::from_le_bytes([header[10], header[11]]);
    if xlen != 6 {
        return Err(io::Error::new(
            ErrorKind::InvalidData,
            format!("unsupported BGZF extra field length: {xlen}"),
        ));
    }
    if header[12] != b'B' || header[13] != b'C' {
        return Err(io::Error::new(ErrorKind::InvalidData, "missing BGZF BC subfield"));
    }
    // BSIZE stores the total block size minus one.
    let bsize = usize::from(u16::from_le_bytes([header[16], header[17]])) + 1;
    if bsize < BGZF_HEADER_SIZE + BGZF_FOOTER_SIZE {
        return Err(io::Error::new(
            ErrorKind::InvalidData,
            format!("BGZF block too small: {bsize}"),
        ));
    }
    let payload_len = bsize - BGZF_HEADER_SIZE - BGZF_FOOTER_SIZE;

    // Peek at the first payload byte: bits 1-2 hold the deflate block type,
    // and type 0 means the data is stored uncompressed.
    let first = {
        let peek = self.inner.fill_buf()?;
        if peek.is_empty() {
            return Err(io::Error::new(
                ErrorKind::UnexpectedEof,
                "truncated BGZF block payload",
            ));
        }
        peek[0]
    };
    let is_stored = (first & 0b110) == 0;

    if is_stored {
        // Stored block framing: one header byte, then LEN and NLEN (2 bytes each).
        let mut framing = [0u8; 5];
        self.inner.read_exact(&mut framing)?;
        let len = usize::from(u16::from_le_bytes([framing[1], framing[2]]));
        if len + 5 != payload_len {
            return Err(io::Error::new(
                ErrorKind::InvalidData,
                format!("stored block size mismatch: LEN={len} payload={payload_len}"),
            ));
        }
        let start = self.buf.len();
        read_exact_into_spare(&mut self.inner, &mut self.buf, len)?;
        let mut footer = [0u8; BGZF_FOOTER_SIZE];
        let mut outcome = self.inner.read_exact(&mut footer);
        if outcome.is_ok() {
            let expected_len =
                u32::from_le_bytes([footer[4], footer[5], footer[6], footer[7]]) as usize;
            if expected_len != len {
                outcome = Err(io::Error::new(
                    ErrorKind::InvalidData,
                    format!("BGZF isize mismatch: header={expected_len} decompressed={len}"),
                ));
            } else if self.check_crc {
                outcome = verify_block_crc(&self.buf[start..start + len], &footer, "stored");
            }
        }
        if outcome.is_err() {
            // Do not leave the rejected block's bytes visible to readers.
            self.buf.truncate(start);
        }
        outcome?;
    } else {
        self.compressed_scratch.clear();
        read_exact_into_spare(&mut self.inner, &mut self.compressed_scratch, payload_len)?;
        let mut footer = [0u8; BGZF_FOOTER_SIZE];
        self.inner.read_exact(&mut footer)?;
        let expected_len =
            u32::from_le_bytes([footer[4], footer[5], footer[6], footer[7]]) as usize;
        // ISIZE is untrusted input; reject values beyond the format limit
        // before using it to size the output buffer.
        if expected_len > MAX_BGZF_ISIZE {
            return Err(io::Error::new(
                ErrorKind::InvalidData,
                format!("BGZF isize too large: {expected_len}"),
            ));
        }
        if expected_len > 0 {
            let start = self.buf.len();
            self.buf.resize(start + expected_len, 0);
            let inflated = self
                .decompressor
                .deflate_decompress(&self.compressed_scratch, &mut self.buf[start..]);
            let outcome = match inflated {
                Err(e) => Err(io::Error::new(
                    ErrorKind::InvalidData,
                    format!("inflate failed: {e:?}"),
                )),
                Ok(n) if n != expected_len => Err(io::Error::new(
                    ErrorKind::InvalidData,
                    format!("BGZF isize mismatch: header={expected_len} decompressed={n}"),
                )),
                Ok(_) if self.check_crc => {
                    verify_block_crc(&self.buf[start..start + expected_len], &footer, "deflate")
                }
                Ok(_) => Ok(()),
            };
            if outcome.is_err() {
                // Drop the zero-filled / partially inflated region again.
                self.buf.truncate(start);
            }
            outcome?;
        }
    }
    Ok(true)
}
}
/// Appends exactly `n` bytes read from `src` to the end of `buf`.
///
/// On success `buf` has grown by `n` bytes. On failure (including a source
/// that ends early, reported as `ErrorKind::UnexpectedEof`) `buf` keeps its
/// original length and contents.
///
/// The destination region is zero-initialised before reading: handing an
/// uninitialised buffer to an arbitrary `Read` implementation is undefined
/// behaviour, and the cost of zeroing is negligible next to the read itself.
fn read_exact_into_spare<R: std::io::Read>(
    src: &mut R,
    buf: &mut Vec<u8>,
    n: usize,
) -> io::Result<()> {
    let start = buf.len();
    buf.resize(start + n, 0);
    let outcome = src.read_exact(&mut buf[start..]);
    if outcome.is_err() {
        // Roll back so callers never observe a partially filled tail.
        buf.truncate(start);
    }
    outcome
}
fn verify_block_crc(
decompressed: &[u8],
footer: &[u8; BGZF_FOOTER_SIZE],
label: &'static str,
) -> io::Result<()> {
let expected = u32::from_le_bytes([footer[0], footer[1], footer[2], footer[3]]);
let actual = libdeflater::crc32(decompressed);
if actual != expected {
return Err(io::Error::new(
ErrorKind::InvalidData,
format!(
"BGZF {label}-block CRC32 mismatch: expected 0x{expected:08x}, got 0x{actual:08x}"
),
));
}
Ok(())
}