#![deny(unsafe_code)]
#![allow(clippy::must_use_candidate, clippy::missing_errors_doc, clippy::missing_panics_doc)]
mod reader;
mod writer;
pub use reader::*;
pub use writer::*;
use std::io;
use byteorder::{ByteOrder, LittleEndian};
use libdeflater::CompressionLvl;
use thiserror::Error;
/// Internal helpers for raw, uninitialised buffer management.
mod buffer_ops {
    /// Resizes `buffer` to exactly `new_len` bytes **without** initialising them.
    ///
    /// # Safety
    ///
    /// After this call the buffer's contents are uninitialised. The caller must
    /// overwrite every byte (up to any later `truncate`) before reading any of
    /// them; reading uninitialised memory is undefined behavior.
    #[inline(always)]
    #[allow(unsafe_code, clippy::uninit_vec)]
    pub(crate) unsafe fn resize_uninit(buffer: &mut Vec<u8>, new_len: usize) {
        buffer.clear();
        buffer.reserve_exact(new_len);
        // SAFETY: capacity is at least `new_len` after `reserve_exact`; the
        // initialisation obligation is forwarded to the caller (see above).
        buffer.set_len(new_len);
    }
}
/// Maximum number of uncompressed bytes carried by one BGZF block payload.
pub const BGZF_BLOCK_SIZE: usize = 65280;
/// Default I/O buffer size used by the reader/writer.
pub const BUFSIZE: usize = 128 * 1024;
/// Hard upper bound on a whole BGZF block: BSIZE is a u16 storing (length - 1).
pub(crate) const MAX_BGZF_BLOCK_SIZE: usize = 64 * 1024;
/// The canonical 28-byte BGZF EOF marker (an empty gzip member) that terminates
/// every well-formed BGZF file.
pub(crate) static BGZF_EOF: &[u8] = &[
0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x06, 0x00, 0x42, 0x43, 0x02, 0x00, 0x1b, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ];
/// Length of the fixed BGZF gzip header, including the "BC" extra subfield.
pub(crate) const BGZF_HEADER_SIZE: usize = 18;
/// Length of the gzip footer: little-endian CRC32 then ISIZE, 4 bytes each.
pub(crate) const BGZF_FOOTER_SIZE: usize = 8;
/// Length of the CRC32 portion of the footer.
pub(crate) const BGZF_SIZEOF_CRC32: usize = 4;
/// FLG byte value with only FEXTRA set, as required for BGZF headers.
pub(crate) const BGZF_NAME_COMMENT_EXTRA_FLAG: u8 = 4;
/// First identifier byte of the BGZF extra subfield ("BC").
pub(crate) const BGZF_SUBFIELD_ID1: u8 = b'B';
/// Second identifier byte of the BGZF extra subfield ("BC").
pub(crate) const BGZF_SUBFIELD_ID2: u8 = b'C';
/// Byte offset of the little-endian u16 BSIZE field within a block.
pub(crate) const BGZF_BLOCK_SIZE_OFFSET: usize = 16;
/// Byte offset of the XFL (extra flags) byte within the gzip header.
pub(crate) const BGZF_XFL_OFFSET: usize = 8;
/// gzip XFL hint: compressor used maximum compression.
pub(crate) const BGZF_COMPRESSION_HINT_BEST: u8 = 2;
/// gzip XFL hint: compressor used fastest compression.
pub(crate) const BGZF_COMPRESSION_HINT_FASTEST: u8 = 4;
/// gzip XFL hint: no particular hint given.
pub(crate) const BGZF_COMPRESSION_HINT_OTHER: u8 = 0;
/// Template BGZF header; XFL (byte 8) and BSIZE (bytes 16-17) are patched in
/// per block by `header_inner`.
const HEADER_TEMPLATE: [u8; BGZF_HEADER_SIZE] = [
0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x06, 0x00, b'B', b'C', 0x02, 0x00, 0x00, 0x00, ];
/// Convenience alias for results produced by this crate.
type BgzfResult<T> = Result<T, BgzfError>;
/// Errors that can occur while compressing or decompressing BGZF data.
#[non_exhaustive]
#[derive(Error, Debug)]
pub enum BgzfError {
    /// A compressed block was larger than the BGZF format allows.
    #[error("Compressed block size ({0}) exceeds max allowed: ({1})")]
    BlockSizeExceeded(usize, usize),
    /// The requested compression level was rejected by libdeflate.
    #[error("Invalid compression level: {0}")]
    CompressionLevel(u8),
    /// Underlying I/O failure.
    #[error(transparent)]
    Io(#[from] io::Error),
    /// The CRC32 stored in a block footer did not match the decompressed data.
    #[error("Invalid checksum, found {found}, expected {expected}")]
    InvalidChecksum { found: u32, expected: u32 },
    /// A block header failed structural validation.
    #[error("Invalid block header: {0}")]
    InvalidHeader(&'static str),
    /// libdeflate reported a compression failure.
    #[error("LibDeflater compression error: {0:?}")]
    LibDeflaterCompress(libdeflater::CompressionError),
    /// libdeflate reported a decompression failure.
    // NOTE(review): variant name misspells "Deflater" as "Delfater"; kept
    // unchanged because renaming a public variant would break the API.
    #[error(transparent)]
    LibDelfaterDecompress(#[from] libdeflater::DecompressionError),
}
/// CRC32 and uncompressed-length values taken from a BGZF block footer.
#[derive(Debug, Copy, Clone)]
struct ChecksumValues {
    // CRC32 of the uncompressed payload.
    sum: u32,
    // ISIZE: number of uncompressed bytes in the block.
    amount: u32,
}
/// A validated deflate compression level (newtype over `libdeflater::CompressionLvl`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CompressionLevel(CompressionLvl);
#[allow(dead_code)]
impl CompressionLevel {
    /// Creates a validated compression level.
    ///
    /// # Errors
    ///
    /// Returns [`BgzfError::CompressionLevel`] if `level` is outside the range
    /// accepted by libdeflate.
    pub fn new(level: u8) -> BgzfResult<Self> {
        // `i32::from` is provably lossless for u8, so the previous `as` cast
        // and its `clippy::cast_lossless` allow are unnecessary.
        CompressionLvl::new(i32::from(level))
            .map(Self)
            .map_err(|_e| BgzfError::CompressionLevel(level))
    }
    /// Borrows the underlying libdeflate level.
    fn inner(&self) -> &libdeflater::CompressionLvl {
        &self.0
    }
}
impl TryFrom<u8> for CompressionLevel {
type Error = BgzfError;
fn try_from(value: u8) -> Result<Self, Self::Error> {
Self::new(value)
}
}
impl From<CompressionLevel> for u8 {
    /// Recovers the raw numeric level. libdeflate levels are small
    /// non-negative integers, so the narrowing cast does not lose data.
    fn from(level: CompressionLevel) -> Self {
        let raw: i32 = level.inner().into();
        raw as u8
    }
}
impl From<&CompressionLevel> for u8 {
    /// Recovers the raw numeric level from a borrowed handle.
    ///
    /// `CompressionLevel` is `Copy`, so this delegates to the by-value
    /// conversion instead of duplicating its logic.
    fn from(level: &CompressionLevel) -> Self {
        u8::from(*level)
    }
}
/// Compresses byte slices into self-contained BGZF blocks.
pub struct Compressor {
    // Reusable libdeflate compressor configured with `level`.
    inner: libdeflater::Compressor,
    // Level retained so each block header can advertise a compression hint.
    level: CompressionLevel,
}
#[allow(dead_code)]
impl Compressor {
    /// Creates a compressor that emits BGZF blocks at the given level.
    #[must_use]
    pub fn new(level: CompressionLevel) -> Self {
        Self { inner: libdeflater::Compressor::new(*level.inner()), level }
    }
    /// Borrows the underlying libdeflate compressor.
    #[inline]
    fn inner(&self) -> &libdeflater::Compressor {
        &self.inner
    }
    /// Mutably borrows the underlying libdeflate compressor.
    #[inline]
    fn inner_mut(&mut self) -> &mut libdeflater::Compressor {
        &mut self.inner
    }
    /// Compresses `input` into a single complete BGZF block, overwriting
    /// `buffer` with header + deflate payload + footer.
    ///
    /// # Errors
    ///
    /// Returns [`BgzfError::BlockSizeExceeded`] if the resulting block would
    /// not fit the format's u16 BSIZE field, and
    /// [`BgzfError::LibDeflaterCompress`] if deflate itself fails.
    #[inline(always)]
    pub fn compress(&mut self, input: &[u8], buffer: &mut Vec<u8>) -> BgzfResult<()> {
        let compress_bound = self.inner_mut().deflate_compress_bound(input.len());
        let required_size = BGZF_HEADER_SIZE + compress_bound + BGZF_FOOTER_SIZE;
        // SAFETY: every byte of `buffer` that survives the final `truncate` is
        // written below — the header span, `bytes_written` payload bytes, and
        // the 8-byte footer — so no uninitialised byte is ever read.
        #[allow(unsafe_code)]
        unsafe {
            buffer_ops::resize_uninit(buffer, required_size);
        }
        let bytes_written = self
            .inner_mut()
            .deflate_compress(input, &mut buffer[BGZF_HEADER_SIZE..])
            .map_err(BgzfError::LibDeflaterCompress)?;
        // BSIZE stores (total block length - 1) in a u16, so the WHOLE block —
        // header + payload + footer — must fit within MAX_BGZF_BLOCK_SIZE.
        // Bounding only `bytes_written` (as the previous check did) let
        // payloads in (MAX - 26, MAX) slip through and overflow the u16 BSIZE
        // arithmetic in `header_inner`.
        if BGZF_HEADER_SIZE + bytes_written + BGZF_FOOTER_SIZE > MAX_BGZF_BLOCK_SIZE {
            return Err(BgzfError::BlockSizeExceeded(bytes_written, MAX_BGZF_BLOCK_SIZE));
        }
        let mut crc = libdeflater::Crc::new();
        crc.update(input);
        let header = header_inner(self.level, bytes_written as u16);
        buffer[0..BGZF_HEADER_SIZE].copy_from_slice(&header);
        let footer_offset = BGZF_HEADER_SIZE + bytes_written;
        // Footer layout: CRC32 of the *uncompressed* input, then ISIZE
        // (uncompressed length), both little-endian u32.
        buffer[footer_offset..footer_offset + BGZF_SIZEOF_CRC32]
            .copy_from_slice(&crc.sum().to_le_bytes());
        buffer[footer_offset + BGZF_SIZEOF_CRC32..footer_offset + BGZF_FOOTER_SIZE]
            .copy_from_slice(&(input.len() as u32).to_le_bytes());
        // Drop the slack between the compress bound and the actual output.
        buffer.truncate(footer_offset + BGZF_FOOTER_SIZE);
        Ok(())
    }
    /// Appends the canonical 28-byte BGZF EOF marker to `bytes`.
    pub fn append_eof(bytes: &mut Vec<u8>) {
        bytes.extend(BGZF_EOF);
    }
}
/// Raw-deflate decompressor wrapper used when reading BGZF blocks.
struct Decompressor(libdeflater::Decompressor);
#[allow(dead_code)]
impl Decompressor {
    /// Creates a new decompressor.
    fn new() -> Self {
        Self(libdeflater::Decompressor::new())
    }
    /// Borrows the underlying libdeflate decompressor.
    #[inline]
    fn inner(&self) -> &libdeflater::Decompressor {
        &self.0
    }
    /// Mutably borrows the underlying libdeflate decompressor.
    #[inline]
    fn inner_mut(&mut self) -> &mut libdeflater::Decompressor {
        &mut self.0
    }
    /// Inflates `input` (a raw deflate payload with BGZF header/footer already
    /// stripped) into `output`, then verifies the footer's CRC32.
    ///
    /// When `checksum_values.amount` is zero (an empty block such as the EOF
    /// marker) decompression is skipped entirely; the CRC check still runs.
    ///
    /// NOTE(review): the CRC is computed over the whole `output` slice, not
    /// just the bytes actually produced by decompression — this presumes the
    /// caller sized `output` to exactly `amount` (the footer's ISIZE). Verify
    /// against the reader before relying on this elsewhere.
    ///
    /// # Errors
    ///
    /// Returns a decompression error from libdeflate, or
    /// [`BgzfError::InvalidChecksum`] on CRC mismatch.
    #[inline]
    fn decompress(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        checksum_values: ChecksumValues,
    ) -> BgzfResult<()> {
        if checksum_values.amount != 0 {
            let _bytes_decompressed = self.inner_mut().deflate_decompress(input, output)?;
        }
        let mut new_check = libdeflater::Crc::new();
        new_check.update(output);
        if checksum_values.sum != new_check.sum() {
            return Err(BgzfError::InvalidChecksum {
                found: new_check.sum(),
                expected: checksum_values.sum,
            });
        }
        Ok(())
    }
}
impl Default for Decompressor {
fn default() -> Self {
Self::new()
}
}
/// Builds the 18-byte BGZF header for a block whose deflate payload is
/// `compressed_size` bytes long.
///
/// NOTE(review): `compressed_size` must not exceed
/// `MAX_BGZF_BLOCK_SIZE - BGZF_HEADER_SIZE - BGZF_FOOTER_SIZE`, otherwise the
/// u16 BSIZE arithmetic below overflows; the caller is responsible for
/// enforcing that bound before calling.
#[inline(always)]
fn header_inner(
    compression_level: CompressionLevel,
    compressed_size: u16,
) -> [u8; BGZF_HEADER_SIZE] {
    let mut header = HEADER_TEMPLATE;
    // XFL hint per the gzip spec: 2 = maximum compression, 4 = fastest,
    // 0 = nothing in particular.
    header[BGZF_XFL_OFFSET] = if compression_level.inner() >= &CompressionLvl::best() {
        BGZF_COMPRESSION_HINT_BEST
    } else if compression_level.inner() <= &CompressionLvl::fastest() {
        BGZF_COMPRESSION_HINT_FASTEST
    } else {
        BGZF_COMPRESSION_HINT_OTHER
    };
    // BSIZE = (header + payload + footer) length minus one, little-endian u16.
    let bsize = compressed_size + BGZF_HEADER_SIZE as u16 + BGZF_FOOTER_SIZE as u16 - 1;
    header[BGZF_BLOCK_SIZE_OFFSET..BGZF_BLOCK_SIZE_OFFSET + 2]
        .copy_from_slice(&bsize.to_le_bytes());
    header
}
/// Validates the BGZF-specific parts of a gzip block header: the FEXTRA flag
/// must be set and the extra subfield must carry the "BC" identifier.
#[inline]
fn check_header(bytes: &[u8]) -> BgzfResult<()> {
    // FLG byte: the FEXTRA bit must be present for a BGZF block.
    if bytes[3] & BGZF_NAME_COMMENT_EXTRA_FLAG != BGZF_NAME_COMMENT_EXTRA_FLAG {
        return Err(BgzfError::InvalidHeader("Extra field flag not set"));
    }
    // Subfield identifier bytes must spell "BC".
    if bytes[12] != BGZF_SUBFIELD_ID1 || bytes[13] != BGZF_SUBFIELD_ID2 {
        return Err(BgzfError::InvalidHeader("Bad SID"));
    }
    Ok(())
}
#[inline]
fn get_block_size(bytes: &[u8]) -> usize {
LittleEndian::read_u16(&bytes[BGZF_BLOCK_SIZE_OFFSET..]) as usize + 1
}
#[inline]
fn get_footer_values(input: &[u8]) -> ChecksumValues {
let check_sum = LittleEndian::read_u32(&input[input.len() - 8..input.len() - 4]);
let check_amount = LittleEndian::read_u32(&input[input.len() - 4..]);
ChecksumValues { sum: check_sum, amount: check_amount }
}
/// Returns `input` with the trailing 8-byte gzip footer removed.
#[inline]
fn strip_footer(input: &[u8]) -> &[u8] {
    let payload_len = input.len() - BGZF_FOOTER_SIZE;
    &input[..payload_len]
}
#[cfg(test)]
mod test {
use std::io::{Read, Write};
use std::{
fs::File,
io::{BufReader, BufWriter},
};
use proptest::prelude::*;
use tempfile::tempdir;
use super::*;
// `finish()` must append the EOF marker exactly once.
#[test]
fn test_eof_marker_written_once_with_finish() {
let mut output = Vec::new();
{
let mut writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
writer.write_all(b"hello").unwrap();
writer.finish().unwrap();
}
assert!(output.ends_with(BGZF_EOF), "Output should end with BGZF_EOF marker");
let eof_count = output.windows(BGZF_EOF.len()).filter(|w| *w == BGZF_EOF).count();
assert_eq!(eof_count, 1, "EOF marker should appear exactly once");
}
// Dropping the writer without calling `finish()` must still emit the marker once.
#[test]
fn test_eof_marker_written_once_on_drop() {
let mut output = Vec::new();
{
let mut writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
writer.write_all(b"hello").unwrap();
}
assert!(output.ends_with(BGZF_EOF), "Output should end with BGZF_EOF marker");
let eof_count = output.windows(BGZF_EOF.len()).filter(|w| *w == BGZF_EOF).count();
assert_eq!(eof_count, 1, "EOF marker should appear exactly once");
}
// A writer that never receives data should produce exactly the EOF marker.
#[test]
fn test_eof_marker_empty_write() {
let mut output = Vec::new();
{
let writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
writer.finish().unwrap();
}
assert!(
output.ends_with(BGZF_EOF),
"Output should end with BGZF_EOF marker even with no data written"
);
assert_eq!(output.as_slice(), BGZF_EOF);
}
// Repeated `flush()` calls must not duplicate the EOF marker.
#[test]
fn test_multiple_flush_single_eof() {
let mut output = Vec::new();
{
let mut writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
writer.write_all(b"hello").unwrap();
writer.flush().unwrap();
writer.write_all(b"world").unwrap();
writer.flush().unwrap();
writer.finish().unwrap();
}
assert!(output.ends_with(BGZF_EOF), "Output should end with BGZF_EOF marker");
let eof_count = output.windows(BGZF_EOF.len()).filter(|w| *w == BGZF_EOF).count();
assert_eq!(
eof_count, 1,
"EOF marker should appear exactly once even after multiple flush() calls"
);
}
// Round-trip: write through the BGZF writer to a file, read back through the
// reader, and compare against the original bytes.
#[test]
fn test_simple_bgzfsync() {
let dir = tempdir().unwrap();
let input = b"
This is a longer test than normal to come up with a bunch of text.
We'll read just a few lines at a time.
What if this is a longer string, does that then make
things fail?
";
let orig_file = dir.path().join("orig.output.txt");
let mut orig_writer = BufWriter::new(File::create(&orig_file).unwrap());
orig_writer.write_all(input).unwrap();
drop(orig_writer);
let output_file = dir.path().join("output.txt");
let out_writer = BufWriter::new(File::create(&output_file).unwrap());
let mut bgzf = Writer::new(out_writer, CompressionLevel::new(3).unwrap());
bgzf.write_all(input).unwrap();
bgzf.finish().unwrap();
let mut reader = BufReader::new(File::open(output_file).unwrap());
let mut result = vec![];
reader.read_to_end(&mut result).unwrap();
let mut decoder = Reader::new(&result[..]);
let mut bytes = vec![];
decoder.read_to_end(&mut bytes).unwrap();
assert_eq!(input.to_vec(), bytes);
}
// Deflate dictionary window size; proptest buffer sizes start at this value.
const DICT_SIZE: usize = 32768;
// Property test: arbitrary payloads, buffer sizes, write chunk sizes, and
// compression levels must all round-trip losslessly.
proptest! {
#[test]
fn proptest_bgzf(
input in prop::collection::vec(0..u8::MAX, 1..(DICT_SIZE * 10)),
buf_size in DICT_SIZE..BGZF_BLOCK_SIZE,
write_size in 1..BGZF_BLOCK_SIZE * 4,
comp_level in 1..12_u8
) {
let dir = tempdir().unwrap();
let output_file = dir.path().join("output.txt");
let out_writer = BufWriter::new(File::create(&output_file).unwrap());
let mut writer = Writer::with_capacity(out_writer, CompressionLevel::new(comp_level).unwrap(), buf_size);
for chunk in input.chunks(write_size) {
writer.write_all(chunk).unwrap();
}
writer.finish().unwrap();
let mut reader = BufReader::new(File::open(output_file).unwrap());
let mut result = vec![];
reader.read_to_end(&mut result).unwrap();
let mut gz = Reader::new(&result[..]);
let mut bytes = vec![];
gz.read_to_end(&mut bytes).unwrap();
assert_eq!(input.clone(), bytes);
}
}
}