1#![forbid(unsafe_code)]
24#![allow(clippy::must_use_candidate, clippy::missing_errors_doc, clippy::missing_panics_doc)]
25
26mod reader;
28mod writer;
29pub use reader::*;
30pub use writer::*;
31
32use std::io;
33
34use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
35use libdeflater::CompressionLvl;
36use thiserror::Error;
37
/// Public BGZF block-size constant: 65280 bytes (`0xff00`).
///
/// NOTE(review): presumably the amount of uncompressed data gathered per block by the writer;
/// the consumers live in the `reader`/`writer` modules — confirm there.
pub const BGZF_BLOCK_SIZE: usize = 0xff00;

/// Public buffer-size constant: 128 KiB.
///
/// NOTE(review): its consumers are outside this part of the crate — confirm its exact role.
pub const BUFSIZE: usize = 1 << 17;

/// Size limit (64 KiB) that `Compressor::compress` validates compressed blocks against.
pub(crate) const MAX_BGZF_BLOCK_SIZE: usize = 1 << 16;

/// The 28-byte marker block appended by `Compressor::append_eof`.
///
/// The first 18 bytes follow the same field order that `header_inner` writes, followed by a
/// two-byte compressed payload and an all-zero eight-byte footer.
pub(crate) static BGZF_EOF: &[u8] = &[
    0x1f, 0x8b, // magic bytes
    0x08, // compression method
    0x04, // flags
    0x00, 0x00, 0x00, 0x00, // mtime
    0x00, // compression hint
    0xff, // OS
    0x06, 0x00, // extra-field length
    0x42, 0x43, // subfield id: "BC"
    0x02, 0x00, // subfield length
    0x1b, 0x00, // block size minus one (27)
    0x03, 0x00, // compressed payload
    0x00, 0x00, 0x00, 0x00, // footer: checksum
    0x00, 0x00, 0x00, 0x00, // footer: uncompressed amount
];

/// Number of bytes in the header written by `header_inner`.
pub(crate) const BGZF_HEADER_SIZE: usize = 18;
/// Number of bytes in the footer: two little-endian `u32` values.
pub(crate) const BGZF_FOOTER_SIZE: usize = 8;
/// First header byte.
pub(crate) const BGZF_MAGIC_BYTE_A: u8 = 0x1f;
/// Second header byte.
pub(crate) const BGZF_MAGIC_BYTE_B: u8 = 0x8b;
/// Third header byte: the compression method identifier.
pub(crate) const BGZF_COMPRESSION_METHOD: u8 = 0x08;
/// Fourth header byte: the flag bit that `check_header` requires to be set.
pub(crate) const BGZF_NAME_COMMENT_EXTRA_FLAG: u8 = 0x04;
/// Modification time written into every header.
pub(crate) const BGZF_DEFAULT_MTIME: u32 = 0;
/// Operating-system byte written into every header.
pub(crate) const BGZF_DEFAULT_OS: u8 = 0xff;
/// Value written into the header's extra-field length slot.
pub(crate) const BGZF_EXTRA_FLAG_LEN: u16 = 6;
/// First subfield identifier byte (`B`), verified by `check_header`.
pub(crate) const BGZF_SUBFIELD_ID1: u8 = b'B';
/// Second subfield identifier byte (`C`), verified by `check_header`.
pub(crate) const BGZF_SUBFIELD_ID2: u8 = b'C';
/// Value written into the header's subfield length slot.
pub(crate) const BGZF_SUBFIELD_LEN: u16 = 2;
/// Byte offset in the header of the 16-bit size field read by `get_block_size`.
pub(crate) const BGZF_BLOCK_SIZE_OFFSET: usize = 16;

/// Hint byte written when the level is at or above libdeflater's "best" level.
pub(crate) const BGZF_COMPRESSION_HINT_BEST: u8 = 2;
/// Hint byte written when the level is at or below libdeflater's "fastest" level.
pub(crate) const BGZF_COMPRESSION_HINT_FASTEST: u8 = 4;
/// Hint byte written for every other level.
pub(crate) const BGZF_COMPRESSION_HINT_OTHER: u8 = 0;
81
/// Fraction of the input length that is reserved as additional output space.
const EXTRA: f64 = 0.1;

/// Returns how many spare bytes to reserve on top of `input_len` when sizing a compression
/// output buffer: `EXTRA` (10%) of the input length, truncated, but never fewer than 128.
#[inline]
fn extra_amount(input_len: usize) -> usize {
    const MIN_EXTRA: usize = 128;

    let proportional = (input_len as f64 * EXTRA) as usize;
    proportional.max(MIN_EXTRA)
}
91
92type BgzfResult<T> = Result<T, BgzfError>;
93
94#[non_exhaustive]
95#[derive(Error, Debug)]
96pub enum BgzfError {
97 #[error("Compressed block size ({0}) exceeds max allowed: ({1})")]
98 BlockSizeExceeded(usize, usize),
99 #[error("Invalid compression level: {0}")]
100 CompressionLevel(u8),
101 #[error(transparent)]
102 Io(#[from] io::Error),
103 #[error("Invalid checksum, found {found}, expected {expected}")]
104 InvalidChecksum { found: u32, expected: u32 },
105 #[error("Invalid block header: {0}")]
106 InvalidHeader(&'static str),
107 #[error("LibDeflater compression error: {0:?}")]
108 LibDeflaterCompress(libdeflater::CompressionError),
109 #[error(transparent)]
110 LibDelfaterDecompress(#[from] libdeflater::DecompressionError),
111}
112
/// The pair of values stored in a block footer, as extracted by `get_footer_values`.
#[derive(Clone, Copy, Debug)]
struct ChecksumValues {
    /// Checksum stored in the footer; compared against the CRC of the decompressed output.
    sum: u32,
    /// Amount stored in the footer; a value of zero makes `Decompressor::decompress` skip the
    /// inflate step.
    amount: u32,
}
121
122#[derive(Debug, Clone, Copy, PartialEq, Eq)]
126pub struct CompressionLevel(CompressionLvl);
127
128#[allow(dead_code)]
129impl CompressionLevel {
130 #[allow(clippy::cast_lossless)]
134 pub fn new(level: u8) -> BgzfResult<Self> {
135 Ok(Self(
137 CompressionLvl::new(level as i32).map_err(|_e| BgzfError::CompressionLevel(level))?,
138 ))
139 }
140
141 fn inner(&self) -> &libdeflater::CompressionLvl {
143 &self.0
144 }
145}
146
147impl TryFrom<u8> for CompressionLevel {
148 type Error = BgzfError;
149
150 fn try_from(value: u8) -> Result<Self, Self::Error> {
160 Self::new(value)
161 }
162}
163
164impl From<CompressionLevel> for u8 {
165 fn from(level: CompressionLevel) -> Self {
167 let inner: i32 = level.inner().into();
168 inner as u8
169 }
170}
171
172impl From<&CompressionLevel> for u8 {
173 fn from(level: &CompressionLevel) -> Self {
175 let inner: i32 = level.inner().into();
176 inner as u8
177 }
178}
179
180pub struct Compressor {
194 inner: libdeflater::Compressor,
195 level: CompressionLevel,
196}
197
198#[allow(dead_code)]
199impl Compressor {
200 pub fn new(level: CompressionLevel) -> Self {
209 Self { inner: libdeflater::Compressor::new(*level.inner()), level }
210 }
211
212 #[inline]
213 fn inner(&self) -> &libdeflater::Compressor {
214 &self.inner
215 }
216
217 #[inline]
218 fn inner_mut(&mut self) -> &mut libdeflater::Compressor {
219 &mut self.inner
220 }
221
222 #[inline]
224 pub fn compress(&mut self, input: &[u8], buffer: &mut Vec<u8>) -> BgzfResult<()> {
225 buffer.resize_with(
226 BGZF_HEADER_SIZE + input.len() + extra_amount(input.len()) + BGZF_FOOTER_SIZE,
227 || 0,
228 );
229
230 let bytes_written = self
231 .inner_mut()
232 .deflate_compress(input, &mut buffer[BGZF_HEADER_SIZE..])
233 .map_err(BgzfError::LibDeflaterCompress)?;
234
235 if bytes_written >= MAX_BGZF_BLOCK_SIZE {
237 return Err(BgzfError::BlockSizeExceeded(bytes_written, MAX_BGZF_BLOCK_SIZE));
238 }
239 let mut check = libdeflater::Crc::new();
240 check.update(input);
241
242 let header = header_inner(self.level, bytes_written as u16);
244 buffer[0..BGZF_HEADER_SIZE].copy_from_slice(&header);
245 buffer.truncate(BGZF_HEADER_SIZE + bytes_written);
246
247 buffer.write_u32::<LittleEndian>(check.sum())?;
248 buffer.write_u32::<LittleEndian>(input.len() as u32)?;
249
250 Ok(())
251 }
252
253 pub fn append_eof(bytes: &mut Vec<u8>) {
255 bytes.extend(BGZF_EOF);
256 }
257}
258
259struct Decompressor(libdeflater::Decompressor);
261
262#[allow(dead_code)]
263impl Decompressor {
264 fn new() -> Self {
266 Self(libdeflater::Decompressor::new())
267 }
268
269 #[inline]
270 fn inner(&self) -> &libdeflater::Decompressor {
271 &self.0
272 }
273
274 #[inline]
275 fn inner_mut(&mut self) -> &mut libdeflater::Decompressor {
276 &mut self.0
277 }
278
279 #[inline]
284 fn decompress(
285 &mut self,
286 input: &[u8],
287 output: &mut [u8],
288 checksum_values: ChecksumValues,
289 ) -> BgzfResult<()> {
290 if checksum_values.amount != 0 {
291 let _bytes_decompressed = self.inner_mut().deflate_decompress(input, output)?;
292 }
293 let mut new_check = libdeflater::Crc::new();
294 new_check.update(output);
295
296 if checksum_values.sum != new_check.sum() {
297 return Err(BgzfError::InvalidChecksum {
298 found: new_check.sum(),
299 expected: checksum_values.sum,
300 });
301 }
302 Ok(())
303 }
304}
305
306impl Default for Decompressor {
307 fn default() -> Self {
308 Self::new()
309 }
310}
311
312#[inline]
314fn header_inner(compression_level: CompressionLevel, compressed_size: u16) -> Vec<u8> {
315 let comp_value = if compression_level.inner() >= &CompressionLvl::best() {
318 BGZF_COMPRESSION_HINT_BEST
319 } else if compression_level.inner() <= &CompressionLvl::fastest() {
320 BGZF_COMPRESSION_HINT_FASTEST
321 } else {
322 BGZF_COMPRESSION_HINT_OTHER
323 };
324
325 let mut header: Vec<u8> = Vec::with_capacity(20);
326 header.write_u8(BGZF_MAGIC_BYTE_A).unwrap(); header.write_u8(BGZF_MAGIC_BYTE_B).unwrap(); header.write_u8(BGZF_COMPRESSION_METHOD).unwrap(); header.write_u8(BGZF_NAME_COMMENT_EXTRA_FLAG).unwrap(); header.write_u32::<LittleEndian>(BGZF_DEFAULT_MTIME).unwrap(); header.write_u8(comp_value).unwrap(); header.write_u8(BGZF_DEFAULT_OS).unwrap(); header.write_u16::<LittleEndian>(BGZF_EXTRA_FLAG_LEN).unwrap(); header.write_u8(BGZF_SUBFIELD_ID1).unwrap(); header.write_u8(BGZF_SUBFIELD_ID2).unwrap(); header.write_u16::<LittleEndian>(BGZF_SUBFIELD_LEN).unwrap(); header
338 .write_u16::<LittleEndian>(
339 compressed_size + BGZF_HEADER_SIZE as u16 + BGZF_FOOTER_SIZE as u16 - 1,
340 )
341 .unwrap(); header
344}
345
346#[inline]
348fn check_header(bytes: &[u8]) -> BgzfResult<()> {
349 if bytes[3] & 4 != BGZF_NAME_COMMENT_EXTRA_FLAG {
351 Err(BgzfError::InvalidHeader("Extra field flag not set"))
352 } else if bytes[12] != BGZF_SUBFIELD_ID1 || bytes[13] != BGZF_SUBFIELD_ID2 {
353 Err(BgzfError::InvalidHeader("Bad SID"))
355 } else {
356 Ok(())
357 }
358}
359
360#[inline]
362fn get_block_size(bytes: &[u8]) -> usize {
363 LittleEndian::read_u16(&bytes[BGZF_BLOCK_SIZE_OFFSET..]) as usize + 1
364}
365
366#[inline]
368fn get_footer_values(input: &[u8]) -> ChecksumValues {
369 let check_sum = LittleEndian::read_u32(&input[input.len() - 8..input.len() - 4]);
370 let check_amount = LittleEndian::read_u32(&input[input.len() - 4..]);
371 ChecksumValues { sum: check_sum, amount: check_amount }
372}
373
374#[inline]
376fn strip_footer(input: &[u8]) -> &[u8] {
377 &input[..input.len() - BGZF_FOOTER_SIZE]
378}
379
#[cfg(test)]
mod test {
    use std::fs::{self, File};
    use std::io::{BufWriter, Read, Write};

    use proptest::prelude::*;
    use tempfile::tempdir;

    use super::*;

    /// Round-trips a short text through `Writer` and `Reader` via a file on disk.
    #[test]
    fn test_simple_bgzfsync() {
        let dir = tempdir().unwrap();

        let input = b"
        This is a longer test than normal to come up with a bunch of text.
        We'll read just a few lines at a time.
        What if this is a longer string, does that then make
        things fail?
        ";

        // Compress the input into a file.
        let output_file = dir.path().join("output.txt");
        let out_writer = BufWriter::new(File::create(&output_file).unwrap());

        let mut bgzf = Writer::new(out_writer, CompressionLevel::new(3).unwrap());
        bgzf.write_all(input).unwrap();
        bgzf.flush().unwrap();
        drop(bgzf);

        // Read the compressed bytes back and decompress them.
        let result = fs::read(&output_file).unwrap();

        let mut decoder = Reader::new(&result[..]);
        let mut bytes = vec![];
        decoder.read_to_end(&mut bytes).unwrap();

        assert_eq!(input.to_vec(), bytes);
    }

    const DICT_SIZE: usize = 32768;
    proptest! {
        /// Round-trips arbitrary data with arbitrary buffer sizes, write sizes and levels.
        ///
        /// The byte strategy covers every value including `0xFF`, and the level range is
        /// inclusive so that level 12 is exercised as well.
        #[test]
        fn proptest_bgzf(
            input in prop::collection::vec(any::<u8>(), 1..(DICT_SIZE * 10)),
            buf_size in DICT_SIZE..BGZF_BLOCK_SIZE,
            write_size in 1..BGZF_BLOCK_SIZE * 4,
            comp_level in 1..=12_u8
        ) {
            let dir = tempdir().unwrap();

            // Compress the input into a file, feeding it in `write_size` chunks.
            let output_file = dir.path().join("output.txt");
            let out_writer = BufWriter::new(File::create(&output_file).unwrap());

            let mut writer = Writer::with_capacity(
                out_writer,
                CompressionLevel::new(comp_level).unwrap(),
                buf_size,
            );

            for chunk in input.chunks(write_size) {
                writer.write_all(chunk).unwrap();
            }
            writer.flush().unwrap();
            drop(writer);

            // Read the compressed bytes back and decompress them.
            let result = fs::read(&output_file).unwrap();

            let mut gz = Reader::new(&result[..]);
            let mut bytes = vec![];
            gz.read_to_end(&mut bytes).unwrap();

            assert_eq!(input, bytes);
        }
    }
}