use super::bgzf::{BgzfError, VirtualOffset};
use std::io::Write;
use tracing::warn;
/// Maximum number of uncompressed bytes buffered into a single BGZF block.
///
/// A complete BGZF block (18-byte header + DEFLATE payload + 8-byte trailer) must not
/// exceed 65536 bytes, because its length minus one is stored in the 16-bit BSIZE field.
/// Incompressible input is emitted as stored DEFLATE blocks that are slightly *larger*
/// than the input, so the payload limit has to leave headroom for that expansion plus
/// the 26 framing bytes. `0xff00` (65280) is the limit htslib uses for the same reason;
/// it also guarantees that an in-block offset always fits in a `u16`.
const MAX_UNCOMPRESSED_SIZE: usize = 0xff00;
/// Template for the fixed 18-byte header that starts every BGZF block.
///
/// Byte layout (gzip member header with one extra subfield):
/// - `1f 8b`        gzip magic (ID1, ID2)
/// - `08`           compression method: DEFLATE
/// - `04`           flags: FEXTRA set
/// - `00 00 00 00`  MTIME
/// - `00`           XFL
/// - `ff`           OS: unknown
/// - `06 00`        XLEN = 6 (little-endian)
/// - `42 43`        subfield id "BC"
/// - `02 00`        subfield length = 2
/// - `00 00`        BSIZE placeholder (bytes 16 and 17), patched per block with
///                  "total block size - 1" before the header is written
const BGZF_HEADER: [u8; 18] = [
0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x06, 0x00, 0x42, 0x43, 0x02, 0x00, 0x00, 0x00, ];
/// The 28-byte empty BGZF block appended by `finish` as the end-of-file marker.
///
/// It is a regular BGZF block with no payload: the standard header with
/// BSIZE = `0x001b` (27 = 28 - 1), an empty final DEFLATE block (`03 00`),
/// CRC32 = 0 and ISIZE = 0.
const EOF_BLOCK: [u8; 28] = [
0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x06, 0x00, 0x42, 0x43, 0x02, 0x00,
0x1b, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ];
/// Buffering writer that emits its input as a sequence of BGZF blocks.
///
/// Bytes are accumulated in `buf`; each flush compresses the buffer into one block and
/// writes it to the wrapped writer. Call `finish` to flush the last block, append the
/// EOF marker and get the wrapped writer back.
///
/// The `W: Write` bound sits on the struct itself because the `Drop` impl needs it
/// (a `Drop` impl cannot add bounds the type does not already declare).
pub struct BgzfWriter<W: Write> {
// Wrapped output; becomes `None` once `finish` has taken it back out.
inner: Option<W>,
// Uncompressed bytes waiting to be written as the next block.
buf: Vec<u8>,
// Scratch space for DEFLATE output, reused from block to block.
compressed_buf: Vec<u8>,
// libdeflater compressor configured with the chosen compression level.
compressor: libdeflater::Compressor,
// Total size in bytes of all blocks flushed so far, i.e. the compressed-stream
// offset at which the block currently being buffered will start.
block_offset: u64,
}
impl<W: Write> BgzfWriter<W> {
    /// Creates a writer around `inner` using compression level 6.
    pub fn new(inner: W) -> Self {
        Self::with_compression_level(inner, 6)
    }

    /// Creates a writer around `inner` with an explicit DEFLATE compression level.
    ///
    /// A `level` rejected by `libdeflater::CompressionLvl::new` does not produce an
    /// error; the writer silently falls back to `CompressionLvl::default()`.
    pub fn with_compression_level(inner: W, level: i32) -> Self {
        let level = libdeflater::CompressionLvl::new(level).unwrap_or_default();
        Self {
            inner: Some(inner),
            buf: Vec::with_capacity(MAX_UNCOMPRESSED_SIZE),
            compressed_buf: Vec::with_capacity(MAX_UNCOMPRESSED_SIZE),
            compressor: libdeflater::Compressor::new(level),
            block_offset: 0,
        }
    }

    /// Borrows the wrapped writer, or reports `AlreadyFinished` once it has been taken.
    fn writer(&mut self) -> Result<&mut W, BgzfError> {
        self.inner.as_mut().ok_or(BgzfError::AlreadyFinished)
    }

    /// Returns the virtual offset at which the next written byte will land.
    ///
    /// The block part is the compressed-stream offset of the block currently being
    /// buffered; the in-block part is the number of bytes buffered so far. Because
    /// `write_all` flushes a block as soon as it fills up, a position at a block
    /// boundary is reported as "start of the next block" rather than "end of the
    /// previous one".
    #[expect(
        clippy::same_name_method,
        reason = "inherent method is the concrete impl; BgzfWrite trait delegates to it for dyn dispatch"
    )]
    pub fn virtual_offset(&self) -> VirtualOffset {
        debug_assert!(
            self.buf.len() <= MAX_UNCOMPRESSED_SIZE,
            "buffer overflow: {} bytes (should never happen)",
            self.buf.len()
        );
        // Saturate rather than wrap. The buffer can only reach the block limit if an
        // earlier flush failed and left it full.
        let within = u16::try_from(self.buf.len()).unwrap_or(u16::MAX);
        VirtualOffset::new(self.block_offset, within)
    }

    /// Flushes the current block if appending `upcoming_bytes` would overflow it.
    ///
    /// Calling this before writing a record of `upcoming_bytes` bytes keeps the record
    /// inside a single block, provided the record itself fits in one block.
    ///
    /// # Errors
    /// Propagates any error from compressing or writing the flushed block.
    #[expect(
        clippy::same_name_method,
        reason = "inherent method is the concrete impl; BgzfWrite trait delegates to it for dyn dispatch"
    )]
    pub fn flush_if_needed(&mut self, upcoming_bytes: usize) -> Result<(), BgzfError> {
        if self.buf.len().saturating_add(upcoming_bytes) > MAX_UNCOMPRESSED_SIZE {
            self.flush_block()?;
        }
        Ok(())
    }

    /// Appends `data` to the stream, splitting it across blocks as needed.
    ///
    /// A block is written out as soon as it is full, so after a successful call the
    /// buffer always holds fewer than `MAX_UNCOMPRESSED_SIZE` bytes and
    /// `virtual_offset` never has to describe a position past the end of a block.
    ///
    /// # Errors
    /// Propagates any error from compressing or writing a completed block.
    #[expect(
        clippy::same_name_method,
        reason = "inherent method is the concrete impl; BgzfWrite trait delegates to it for dyn dispatch"
    )]
    pub fn write_all(&mut self, data: &[u8]) -> Result<(), BgzfError> {
        let mut remaining = data;
        while !remaining.is_empty() {
            let space = MAX_UNCOMPRESSED_SIZE.saturating_sub(self.buf.len());
            if space == 0 {
                // Only reachable when a previous flush failed and left the buffer full;
                // retry it before accepting more data.
                self.flush_block()?;
                continue;
            }
            let (chunk, rest) = remaining.split_at(remaining.len().min(space));
            self.buf.extend_from_slice(chunk);
            remaining = rest;
            if self.buf.len() >= MAX_UNCOMPRESSED_SIZE {
                // Flush eagerly so a full buffer is never left behind.
                self.flush_block()?;
            }
        }
        Ok(())
    }

    /// Compresses the buffered bytes into one BGZF block and writes it out.
    ///
    /// Does nothing when the buffer is empty. On success the buffer is cleared and
    /// `block_offset` advances by the size of the block just written. On failure the
    /// buffer is left untouched.
    ///
    /// # Errors
    /// - `CompressionFailed` if libdeflater cannot compress the buffer.
    /// - `CorruptHeader` if the block size does not fit the 16-bit BSIZE field or an
    ///   offset computation overflows.
    /// - `AlreadyFinished` if the wrapped writer has already been taken.
    /// - `WriteFailed` if writing to the wrapped writer fails.
    fn flush_block(&mut self) -> Result<(), BgzfError> {
        if self.buf.is_empty() {
            return Ok(());
        }

        // Size the scratch buffer for the worst case, compress, then trim to fit.
        let bound = self.compressor.deflate_compress_bound(self.buf.len());
        self.compressed_buf.clear();
        self.compressed_buf.resize(bound, 0);
        let compressed_len = self
            .compressor
            .deflate_compress(&self.buf, &mut self.compressed_buf)
            .map_err(|source| BgzfError::CompressionFailed { source })?;
        self.compressed_buf.truncate(compressed_len);

        // The trailer carries the CRC32 and length of the *uncompressed* data.
        let mut crc = libdeflater::Crc::new();
        crc.update(&self.buf);
        let crc32 = crc.sum();

        // Block = header + payload + 8-byte trailer; BSIZE stores that total minus one.
        let total_block_size = BGZF_HEADER
            .len()
            .checked_add(compressed_len)
            .and_then(|n| n.checked_add(8))
            .ok_or(BgzfError::CorruptHeader)?;
        let bsize = total_block_size
            .checked_sub(1)
            .and_then(|n| u16::try_from(n).ok())
            .ok_or(BgzfError::CorruptHeader)?;
        let bsize_bytes = bsize.to_le_bytes();
        let mut header = BGZF_HEADER;
        #[allow(clippy::indexing_slicing, reason = "fixed-size header with known offsets")]
        {
            header[16] = bsize_bytes[0];
            header[17] = bsize_bytes[1];
        }
        let isize_val = u32::try_from(self.buf.len()).map_err(|_| BgzfError::CorruptHeader)?;

        let w = self.inner.as_mut().ok_or(BgzfError::AlreadyFinished)?;
        w.write_all(&header).map_err(|source| BgzfError::WriteFailed { source })?;
        w.write_all(&self.compressed_buf).map_err(|source| BgzfError::WriteFailed { source })?;
        w.write_all(&crc32.to_le_bytes()).map_err(|source| BgzfError::WriteFailed { source })?;
        w.write_all(&isize_val.to_le_bytes())
            .map_err(|source| BgzfError::WriteFailed { source })?;

        // Block length is BSIZE + 1; deriving it from the validated u16 avoids a lossy cast.
        let block_len = u64::from(bsize) + 1;
        self.block_offset =
            self.block_offset.checked_add(block_len).ok_or(BgzfError::CorruptHeader)?;
        self.buf.clear();
        Ok(())
    }

    /// Flushes any buffered data, appends the BGZF EOF marker, flushes the wrapped
    /// writer and returns it.
    ///
    /// After a successful call the writer holds neither buffered data nor the wrapped
    /// writer, so the `Drop` impl has nothing left to do.
    ///
    /// # Errors
    /// Propagates errors from flushing the last block, and reports `WriteFailed` if
    /// writing the EOF marker or flushing the wrapped writer fails.
    pub fn finish(mut self) -> Result<W, BgzfError> {
        self.flush_block()?;
        let w = self.writer()?;
        w.write_all(&EOF_BLOCK).map_err(|source| BgzfError::WriteFailed { source })?;
        w.flush().map_err(|source| BgzfError::WriteFailed { source })?;
        self.inner.take().ok_or(BgzfError::AlreadyFinished)
    }
}
impl<W: Write> Drop for BgzfWriter<W> {
fn drop(&mut self) {
if !self.buf.is_empty()
&& self.inner.is_some()
&& let Err(e) = self.flush_block()
{
warn!("BgzfWriter::drop: failed to flush {} buffered bytes: {e}", self.buf.len());
}
}
}
#[allow(clippy::cast_possible_truncation, reason = "tests")]
#[allow(clippy::indexing_slicing, reason = "tests")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::bam::bgzf::BgzfReader;
    use std::io::Cursor;

    /// Compresses `data` with a default-level writer and returns the finished stream.
    fn write_and_finish(data: &[u8]) -> Vec<u8> {
        let mut sink = Vec::new();
        let mut bgzf = BgzfWriter::new(&mut sink);
        bgzf.write_all(data).unwrap();
        bgzf.finish().unwrap();
        sink
    }

    /// Decompresses a complete BGZF stream back into its original bytes.
    fn read_all(compressed: &[u8]) -> Vec<u8> {
        let mut decoded = Vec::new();
        let mut reader = BgzfReader::from_reader(Cursor::new(compressed));
        reader.read_to_end(&mut decoded).unwrap();
        decoded
    }

    /// A payload far smaller than one block survives a write/read cycle.
    #[test]
    fn round_trip_small() {
        let data = b"Hello, BGZF world!";
        let stream = write_and_finish(data);
        assert_eq!(read_all(&stream), data);
    }

    /// A payload that exactly fills one block survives a write/read cycle.
    #[test]
    fn round_trip_exact_block_boundary() {
        // Repeating 0, 1, ..., 255 pattern, exactly one block long.
        let data: Vec<u8> = (0..=u8::MAX).cycle().take(MAX_UNCOMPRESSED_SIZE).collect();
        let stream = write_and_finish(&data);
        assert_eq!(read_all(&stream), data);
    }

    /// A payload spanning several blocks survives a write/read cycle.
    #[test]
    fn round_trip_spanning_multiple_blocks() {
        // Repeating 0, 1, ..., 250 pattern, 200 000 bytes long.
        let data: Vec<u8> = (0..251u8).cycle().take(200_000).collect();
        let stream = write_and_finish(&data);
        assert_eq!(read_all(&stream), data);
    }

    /// The finished stream ends with the BGZF EOF marker.
    #[test]
    fn eof_marker_present() {
        let mut output = Vec::new();
        let writer = BgzfWriter::new(&mut output);
        writer.finish().unwrap();

        assert!(output.len() >= 28);
        let tail = &output[output.len() - 28..];
        assert_eq!(tail, &EOF_BLOCK);
    }

    /// Virtual offsets follow buffered bytes and advance to a new block after a flush.
    #[test]
    fn virtual_offset_tracking() {
        let mut output = Vec::new();
        let mut writer = BgzfWriter::new(&mut output);

        let at_start = writer.virtual_offset();
        assert_eq!(at_start.block_offset(), 0);
        assert_eq!(at_start.within_block(), 0);

        writer.write_all(b"test data").unwrap();
        let after_nine = writer.virtual_offset();
        assert_eq!(after_nine.block_offset(), 0);
        assert_eq!(after_nine.within_block(), 9);

        // Top the first block up to its limit, then spill one byte into the next one.
        let padding = vec![0u8; MAX_UNCOMPRESSED_SIZE - 9];
        writer.write_all(&padding).unwrap();
        writer.write_all(&[0x42]).unwrap();
        let in_second_block = writer.virtual_offset();
        assert!(in_second_block.block_offset() > 0, "should have advanced to a new block");
        assert_eq!(in_second_block.within_block(), 1, "one byte in the new block");

        writer.finish().unwrap();
    }

    /// `flush_if_needed` starts a new block when the upcoming bytes would not fit.
    #[test]
    fn flush_if_needed_triggers_flush() {
        let mut output = Vec::new();
        let mut writer = BgzfWriter::new(&mut output);

        let payload = vec![0u8; 60_000];
        writer.write_all(&payload).unwrap();
        assert_eq!(writer.virtual_offset().block_offset(), 0);

        writer.flush_if_needed(10_000).unwrap();
        let flushed = writer.virtual_offset();
        assert!(flushed.block_offset() > 0);
        assert_eq!(flushed.within_block(), 0);

        writer.finish().unwrap();
    }

    /// `flush_if_needed` leaves the block alone when the upcoming bytes still fit.
    #[test]
    fn flush_if_needed_no_flush_when_fits() {
        let mut output = Vec::new();
        let mut writer = BgzfWriter::new(&mut output);

        let payload = vec![0u8; 1000];
        writer.write_all(&payload).unwrap();

        let before = writer.virtual_offset();
        writer.flush_if_needed(100).unwrap();
        let after = writer.virtual_offset();
        assert_eq!(before, after, "should not have flushed");

        writer.finish().unwrap();
    }

    /// Both ends of the compression-level range produce readable streams, and level 9
    /// is never larger than level 1 on repetitive input.
    #[test]
    fn compression_level_configurable() {
        /// Compresses `payload` at `level` and returns the finished stream.
        fn compress_at(level: i32, payload: &[u8]) -> Vec<u8> {
            let mut sink = Vec::new();
            let mut bgzf = BgzfWriter::with_compression_level(&mut sink, level);
            bgzf.write_all(payload).unwrap();
            bgzf.finish().unwrap();
            sink
        }

        let data = b"compression test data repeated many times for better ratio ";
        let repeated: Vec<u8> = data.iter().copied().cycle().take(10_000).collect();

        let out1 = compress_at(1, &repeated);
        let out9 = compress_at(9, &repeated);

        assert!(out9.len() <= out1.len());
        assert_eq!(read_all(&out1), repeated);
        assert_eq!(read_all(&out9), repeated);
    }

    /// Finishing a writer that received no data emits nothing but the EOF marker.
    #[test]
    fn empty_write_produces_only_eof() {
        let mut output = Vec::new();
        let writer = BgzfWriter::new(&mut output);
        writer.finish().unwrap();
        assert_eq!(output.len(), 28);
    }
}