use crate::gzi::GziIndex;
use flate2::Decompress;
use std::io::{self, BufRead, Read, Seek, SeekFrom};
/// First byte of the gzip magic number; compared against header byte 0.
const GZIP_ID1: u8 = 0x1f;
/// Second byte of the gzip magic number; compared against header byte 1.
const GZIP_ID2: u8 = 0x8b;
/// Compression-method value for deflate; compared against header byte 2.
const GZIP_CM_DEFLATE: u8 = 8;
/// Bit of the gzip FLG byte (header byte 3) that must be set for a block to be accepted.
const GZIP_FLG_FEXTRA: u8 = 1 << 2;
// Nothing in the code visible here references this constant, hence the lint
// suppression. Presumably it is the gzip OS-field value for "unknown" -- confirm.
#[allow(dead_code)]
const GZIP_OS_UNKNOWN: u8 = 0xff;
/// First identifier byte (`B`) of the extra subfield that carries the block size.
const BGZF_EXTRA_ID: u8 = b'B';
/// Second identifier byte (`C`) of the extra subfield that carries the block size.
const BGZF_EXTRA_SUBFIELD: u8 = b'C';
/// Output capacity (64 KiB) reserved before inflating a single block.
const BGZF_MAX_BLOCK_SIZE: usize = 1 << 16;
/// Streaming reader that decodes a BGZF file one block at a time.
///
/// Decoded bytes are exposed through the `Read` and `BufRead` implementations;
/// random access by uncompressed offset is available when a `.gzi` index has
/// been supplied.
pub struct BgzfReader<R>
where
    R: Read + Seek,
{
    /// Underlying compressed byte source.
    inner: R,
    /// Optional `.gzi` index; required by `seek_uncompressed`.
    gzi_index: Option<GziIndex>,
    /// Inflated contents of the most recently decoded block.
    decompressed_buf: Vec<u8>,
    /// Index of the next unread byte inside `decompressed_buf`.
    buf_pos: usize,
    /// Uncompressed offset just past the last decoded block (or the block
    /// start chosen by a seek when nothing has been decoded since).
    current_uncompressed_pos: u64,
    /// Set once the underlying source reported end-of-file at a block boundary.
    eof: bool,
}
impl<R: Read + Seek> BgzfReader<R>
{
pub fn new(inner: R) -> Self
{
Self {
inner,
gzi_index: None,
decompressed_buf: Vec::new(),
buf_pos: 0,
current_uncompressed_pos: 0,
eof: false,
}
}
pub fn with_index(mut inner: R, gzi_index: GziIndex) -> io::Result<Self>
{
inner.seek(SeekFrom::Start(0))?;
Ok(Self {
inner,
gzi_index: Some(gzi_index),
decompressed_buf: Vec::new(),
buf_pos: 0,
current_uncompressed_pos: 0,
eof: false,
})
}
/// Positions the reader at `uncompressed_pos` in the decompressed stream.
///
/// The `.gzi` index is asked for a compressed offset for the target and then
/// for the uncompressed offset that corresponds to that compressed offset
/// (presumably the start of the containing block -- the index semantics are
/// defined elsewhere). The underlying source is moved there and blocks are
/// decoded until the target lies inside the buffered block.
///
/// Returns the position that was sought (`uncompressed_pos`).
///
/// # Errors
///
/// * `NotFound` if the reader was built without an index.
/// * `InvalidInput` if the index has no entry covering `uncompressed_pos`.
/// * `InvalidData` if the index cannot map the compressed offset back, or
///   reports a block start that lies beyond the requested position.
/// * `UnexpectedEof` if the file ends before the target is reached.
/// * Any error from seeking the source or decoding a block.
pub fn seek_uncompressed(&mut self, uncompressed_pos: u64) -> io::Result<u64> {
    let gzi = self.gzi_index.as_ref().ok_or_else(|| {
        io::Error::new(io::ErrorKind::NotFound, "No .gzi index available for seeking")
    })?;
    let compressed_offset = gzi.get_compressed_offset(uncompressed_pos).ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::InvalidInput,
            format!("Uncompressed offset {} beyond index range", uncompressed_pos),
        )
    })?;
    let block_start_uncompressed =
        gzi.get_uncompressed_offset(compressed_offset).ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                format!(
                    "Could not find uncompressed offset for compressed offset {}",
                    compressed_offset
                ),
            )
        })?;

    // Reject an inconsistent index before touching any reader state; otherwise
    // the in-block offset computed below would underflow.
    if block_start_uncompressed > uncompressed_pos {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!(
                "Index block start {} lies beyond requested uncompressed offset {}",
                block_start_uncompressed, uncompressed_pos
            ),
        ));
    }

    self.inner.seek(SeekFrom::Start(compressed_offset))?;
    self.decompressed_buf.clear();
    self.buf_pos = 0;
    self.current_uncompressed_pos = block_start_uncompressed;
    // A previous read may have hit end-of-file; after repositioning there can
    // be data ahead again, so the sticky flag must not survive the seek.
    self.eof = false;

    // Decode forward until the end of the buffered block reaches the target.
    while self.current_uncompressed_pos < uncompressed_pos {
        if !self.read_next_block()? {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "Reached end of file before target position",
            ));
        }
    }

    // `current_uncompressed_pos` marks the END of the buffered block, so the
    // block starts `decompressed_buf.len()` bytes earlier. The difference is
    // at most the buffer length, hence it always fits in `usize`.
    let buffered_start = self.current_uncompressed_pos - self.decompressed_buf.len() as u64;
    self.buf_pos = (uncompressed_pos - buffered_start) as usize;
    Ok(uncompressed_pos)
}
/// Returns the uncompressed offset of the next byte the reader will yield.
///
/// With an empty buffer this is the tracked stream position itself;
/// otherwise it is the start of the buffered block plus the read cursor.
pub fn current_position(&self) -> u64 {
    let buffered = self.decompressed_buf.len() as u64;
    if buffered == 0 {
        return self.current_uncompressed_pos;
    }
    // The tracked position points past the buffered block, so step back by
    // the block length and forward by what has already been consumed.
    self.current_uncompressed_pos - buffered + self.buf_pos as u64
}
/// Borrows the `.gzi` index, or returns `None` if the reader has none.
pub fn gzi_index(&self) -> Option<&GziIndex> {
    Option::as_ref(&self.gzi_index)
}
/// Reads one BGZF block from `inner` and inflates it into `decompressed_buf`.
///
/// Returns `Ok(true)` when a block was decoded (its payload may be empty) and
/// `Ok(false)` when the source is exhausted exactly at a block boundary, in
/// which case `eof` is set. On success the read cursor is reset to the start
/// of the new buffer and `current_uncompressed_pos` advances by the number of
/// decoded bytes. On a decoding failure the buffer is left empty so that no
/// partially inflated data can be served.
///
/// The CRC32 stored in the block trailer is not verified; only the stored
/// uncompressed length (ISIZE) is checked.
///
/// # Errors
///
/// * `UnexpectedEof` if the source ends inside a block.
/// * `InvalidData` if the gzip magic, compression method or FEXTRA flag is
///   wrong, the `BC` subfield is missing or malformed, the declared block size
///   is too small, the deflate stream is corrupt or incomplete, or the decoded
///   length disagrees with the trailer.
/// * Any other I/O error reported by the underlying source.
fn read_next_block(&mut self) -> io::Result<bool> {
    // Fixed 10-byte gzip header followed by the 2-byte XLEN field.
    const HEADER_LEN: usize = 12;
    // CRC32 (4 bytes) followed by ISIZE (4 bytes).
    const TRAILER_LEN: usize = 8;

    // Scans the extra field for the `BC` subfield and returns its 16-bit
    // payload (total block size minus one). Every subfield, including `BC`
    // itself, must fit entirely inside `extra`; a subfield that overruns the
    // field ends the search instead of indexing out of bounds.
    fn bsize_from_extra(extra: &[u8]) -> Option<usize> {
        let mut rest = extra;
        while rest.len() >= 4 {
            let sublen = usize::from(u16::from_le_bytes([rest[2], rest[3]]));
            let payload = rest.get(4..4 + sublen)?;
            if rest[0] == BGZF_EXTRA_ID && rest[1] == BGZF_EXTRA_SUBFIELD && sublen >= 2 {
                return Some(usize::from(u16::from_le_bytes([payload[0], payload[1]])));
            }
            rest = &rest[4 + sublen..];
        }
        None
    }

    // Read the header by hand rather than with `read_exact` so that "no bytes
    // at all" (clean end of file) can be told apart from a truncated header.
    let mut header = [0u8; HEADER_LEN];
    let mut filled = 0;
    while filled < HEADER_LEN {
        match self.inner.read(&mut header[filled..]) {
            Ok(0) => break,
            Ok(n) => filled += n,
            // A signal-interrupted read is not a failure; try again.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    if filled == 0 {
        self.eof = true;
        return Ok(false);
    }
    if filled < HEADER_LEN {
        return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "Incomplete BGZF header"));
    }

    if header[0] != GZIP_ID1 || header[1] != GZIP_ID2 {
        return Err(io::Error::new(io::ErrorKind::InvalidData, "Invalid gzip magic number"));
    }
    if header[2] != GZIP_CM_DEFLATE {
        return Err(io::Error::new(io::ErrorKind::InvalidData, "Not deflate compression"));
    }
    if header[3] & GZIP_FLG_FEXTRA == 0 {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "BGZF requires extra field (FEXTRA flag not set)",
        ));
    }
    let xlen = usize::from(u16::from_le_bytes([header[10], header[11]]));

    let mut extra = vec![0u8; xlen];
    self.inner.read_exact(&mut extra)?;
    let block_size = bsize_from_extra(&extra).ok_or_else(|| {
        io::Error::new(io::ErrorKind::InvalidData, "BC subfield not found in BGZF extra field")
    })?;

    // The stored value is the total block length minus one. What remains after
    // removing header, extra field and trailer is the deflate payload, which
    // must be non-empty.
    let overhead = HEADER_LEN + xlen + TRAILER_LEN;
    let compressed_size = match (block_size + 1).checked_sub(overhead) {
        Some(n) if n > 0 => n,
        _ => {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Invalid BGZF block size: {}, xlen: {}", block_size, xlen),
            ));
        }
    };

    let mut compressed_data = vec![0u8; compressed_size];
    self.inner.read_exact(&mut compressed_data)?;
    let mut trailer = [0u8; TRAILER_LEN];
    self.inner.read_exact(&mut trailer)?;
    let expected_len = u32::from_le_bytes([trailer[4], trailer[5], trailer[6], trailer[7]]);

    // Reset the cursor together with the buffer so the two can never disagree,
    // even if decoding fails below.
    self.decompressed_buf.clear();
    self.buf_pos = 0;
    // `decompress_vec` only writes into spare capacity, so reserve a full block.
    self.decompressed_buf.reserve(BGZF_MAX_BLOCK_SIZE);

    let mut decompress = Decompress::new(false);
    let status = decompress.decompress_vec(
        &compressed_data,
        &mut self.decompressed_buf,
        flate2::FlushDecompress::Finish,
    );
    let verdict = match status {
        Err(e) => Err(io::Error::from(e)),
        Ok(flate2::Status::StreamEnd) => {
            if self.decompressed_buf.len() as u64 == u64::from(expected_len) {
                Ok(())
            } else {
                Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    format!(
                        "BGZF block decompressed to {} bytes but trailer declares {}",
                        self.decompressed_buf.len(),
                        expected_len
                    ),
                ))
            }
        }
        // Anything other than a finished stream means the payload was cut
        // short or would not fit into a single block's worth of output.
        Ok(_) => Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "BGZF block deflate stream is incomplete or exceeds the maximum block size",
        )),
    };
    if let Err(e) = verdict {
        // Never leave partially inflated bytes behind for readers to consume.
        self.decompressed_buf.clear();
        return Err(e);
    }

    self.current_uncompressed_pos += self.decompressed_buf.len() as u64;
    Ok(true)
}
/// Returns the unread part of the current block, decoding further blocks as
/// needed.
///
/// An empty slice is returned only at the true end of the stream. Blocks with
/// an empty payload (for example the end-of-file marker of the first file in
/// a concatenation) are skipped, because an empty slice would otherwise be
/// taken for end-of-file while more data follows.
///
/// # Errors
///
/// Propagates any error from reading or decoding the next block.
fn fill_buf(&mut self) -> io::Result<&[u8]> {
    // Keep decoding until there is something to hand out or the source ends.
    while self.buf_pos >= self.decompressed_buf.len() {
        if self.eof || !self.read_next_block()? {
            return Ok(&[]);
        }
    }
    Ok(&self.decompressed_buf[self.buf_pos..])
}
}
impl<R> Read for BgzfReader<R>
where
    R: Read + Seek,
{
    /// Copies decompressed bytes into `buf`, crossing block boundaries until
    /// `buf` is full or the stream ends, and returns the number of bytes copied.
    ///
    /// NOTE(review): if decoding a later block fails after some bytes were
    /// already copied, the error is returned and those bytes are not reported
    /// to the caller.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let mut copied = 0;
        loop {
            let dest = &mut buf[copied..];
            if dest.is_empty() {
                return Ok(copied);
            }
            // Resolves to the inherent `fill_buf`, which decodes on demand.
            let chunk = self.fill_buf()?;
            if chunk.is_empty() {
                return Ok(copied);
            }
            let n = chunk.len().min(dest.len());
            dest[..n].copy_from_slice(&chunk[..n]);
            self.buf_pos += n;
            copied += n;
        }
    }
}
impl<R> BufRead for BgzfReader<R>
where
    R: Read + Seek,
{
    /// Exposes the unread part of the current block, decoding a new block when
    /// the current one is exhausted.
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        // The inherent method of the same name takes precedence over this
        // trait method, so this delegates rather than recursing.
        self.fill_buf()
    }

    /// Advances the read cursor by `amt` bytes, clamped to the end of the
    /// buffered block.
    fn consume(&mut self, amt: usize) {
        let end = self.decompressed_buf.len();
        self.buf_pos = (self.buf_pos + amt).min(end);
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{Cursor, ErrorKind, Read};

    /// The canonical 28-byte BGZF end-of-file marker: a well-formed block
    /// whose deflate payload inflates to zero bytes.
    const EOF_MARKER: [u8; 28] = [
        0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x06, 0x00, 0x42, 0x43, 0x02,
        0x00, 0x1b, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    ];

    /// Drains `bytes` through a fresh reader and returns the outcome.
    fn read_all(bytes: &[u8]) -> std::io::Result<Vec<u8>> {
        let mut reader = BgzfReader::new(Cursor::new(bytes.to_vec()));
        let mut out = Vec::new();
        reader.read_to_end(&mut out)?;
        Ok(out)
    }

    #[test]
    fn test_gzip_magic_verification() {
        assert_eq!(GZIP_ID1, 0x1f);
        assert_eq!(GZIP_ID2, 0x8b);
    }

    #[test]
    fn test_bgzf_constants() {
        assert_eq!(GZIP_CM_DEFLATE, 8);
        assert_eq!(BGZF_EXTRA_ID, 66);
        assert_eq!(BGZF_EXTRA_SUBFIELD, 67);
    }

    #[test]
    fn test_reader_creation() {
        let data = b"not real gzip data";
        let cursor = Cursor::new(data);
        let reader = BgzfReader::new(cursor);
        assert!(reader.gzi_index.is_none());
        assert_eq!(reader.current_position(), 0);
    }

    #[test]
    fn test_empty_input_is_clean_eof() {
        let out = read_all(&[]).expect("empty input is a clean end of stream");
        assert!(out.is_empty());
    }

    #[test]
    fn test_eof_marker_block_yields_no_data() {
        let out = read_all(&EOF_MARKER).expect("EOF marker is a valid block");
        assert!(out.is_empty());
    }

    #[test]
    fn test_invalid_magic_is_rejected() {
        let err = read_all(b"not real gzip data").unwrap_err();
        assert_eq!(err.kind(), ErrorKind::InvalidData);
    }

    #[test]
    fn test_truncated_header_is_unexpected_eof() {
        let err = read_all(&EOF_MARKER[..5]).unwrap_err();
        assert_eq!(err.kind(), ErrorKind::UnexpectedEof);
    }
}