1#![deny(unsafe_code)]
24#![allow(clippy::must_use_candidate, clippy::missing_errors_doc, clippy::missing_panics_doc)]
25
26mod reader;
28mod writer;
29pub use reader::*;
30pub use writer::*;
31
32use std::io;
33
34use byteorder::{ByteOrder, LittleEndian};
35use libdeflater::CompressionLvl;
36use thiserror::Error;
37
/// Low-level helpers for preparing reusable byte buffers.
mod buffer_ops {
    /// Discards the contents of `buffer` and gives it a length of exactly `new_len`
    /// **without** initializing any of the bytes.
    ///
    /// Capacity is grown with `reserve_exact` when the existing allocation is too small.
    ///
    /// # Safety
    ///
    /// After this call all `new_len` bytes of `buffer` are uninitialized. The caller must
    /// write every byte it intends to keep before reading it, and must shrink the vector
    /// (for example with `truncate` or `clear`) so that bytes which were never written
    /// cannot be observed by later code.
    #[inline(always)]
    #[allow(unsafe_code, clippy::uninit_vec)]
    pub(crate) unsafe fn resize_uninit(buffer: &mut Vec<u8>, new_len: usize) {
        // Start from an empty vector so `reserve_exact` guarantees room for `new_len` bytes.
        buffer.clear();
        buffer.reserve_exact(new_len);
        // SAFETY: the vector is empty and has capacity for at least `new_len` elements, so
        // the new length is within the allocation. `u8` has no destructor; initializing the
        // bytes before they are read is the caller's obligation (see `# Safety`).
        unsafe {
            buffer.set_len(new_len);
        }
    }
}
57
/// Block size of 65280 (`0xff00`) bytes.
///
/// The tests in this file use it as the upper bound for writer buffer capacities;
/// presumably it is the default amount of uncompressed data placed in one block
/// (confirm against the `writer` module).
pub const BGZF_BLOCK_SIZE: usize = 0xff00;

/// Buffer size of 128 KiB.
///
/// Not referenced in this part of the file; presumably consumed by the reader/writer
/// modules (confirm there).
pub const BUFSIZE: usize = 128 << 10;

/// 64 KiB: the limit [`Compressor::compress`] enforces on the size of one compressed block.
pub(crate) const MAX_BGZF_BLOCK_SIZE: usize = 1 << 16;
67
/// The 28-byte marker block appended at the end of a BGZF stream
/// (see [`Compressor::append_eof`]).
///
/// It has the same layout as any other block: an 18-byte header, a deflate payload and
/// an 8-byte footer. Field names in the comments follow the gzip header layout (RFC 1952).
pub(crate) static BGZF_EOF: &[u8] = &[
    0x1f, 0x8b, // gzip magic (ID1, ID2)
    0x08, // CM: deflate
    0x04, // FLG: extra field present
    0x00, 0x00, 0x00, 0x00, // MTIME
    0x00, // XFL
    0xff, // OS: unknown
    0x06, 0x00, // XLEN: 6 bytes of extra field follow
    b'B', b'C', // extra subfield identifier
    0x02, 0x00, // subfield length: 2
    0x1b, 0x00, // BSIZE: total block size - 1 (28 - 1 = 27)
    0x03, 0x00, // deflate stream encoding no data
    0x00, 0x00, 0x00, 0x00, // CRC-32 of the (empty) uncompressed data
    0x00, 0x00, 0x00, 0x00, // length of the (empty) uncompressed data
];
83
/// Size in bytes of the fixed block header (the length of [`HEADER_TEMPLATE`]).
pub(crate) const BGZF_HEADER_SIZE: usize = 18;
/// Size in bytes of the block footer: a CRC-32 followed by a 32-bit length.
pub(crate) const BGZF_FOOTER_SIZE: usize = BGZF_SIZEOF_CRC32 + std::mem::size_of::<u32>();
/// Size in bytes of the CRC-32 stored at the start of the footer.
pub(crate) const BGZF_SIZEOF_CRC32: usize = std::mem::size_of::<u32>();
/// Bit of header byte 3 that `check_header` requires to be set (extra field present).
pub(crate) const BGZF_NAME_COMMENT_EXTRA_FLAG: u8 = 1 << 2;
/// First identifier byte of the BGZF extra subfield (header byte 12).
pub(crate) const BGZF_SUBFIELD_ID1: u8 = 0x42; // ASCII 'B'
/// Second identifier byte of the BGZF extra subfield (header byte 13).
pub(crate) const BGZF_SUBFIELD_ID2: u8 = 0x43; // ASCII 'C'
/// Offset within the header of the little-endian `u16` holding `block size - 1`.
pub(crate) const BGZF_BLOCK_SIZE_OFFSET: usize = 16;
/// Offset within the header of the byte that receives the compression hint.
pub(crate) const BGZF_XFL_OFFSET: usize = 8;

/// Hint written when the compression level is at or above libdeflater's best level.
pub(crate) const BGZF_COMPRESSION_HINT_BEST: u8 = 0x02;
/// Hint written when the compression level is at or below libdeflater's fastest level.
pub(crate) const BGZF_COMPRESSION_HINT_FASTEST: u8 = 0x04;
/// Hint written for every other compression level.
pub(crate) const BGZF_COMPRESSION_HINT_OTHER: u8 = 0x00;

/// Header bytes shared by every block; `header_inner` copies this and then fills in the
/// compression hint (byte 8) and the block size (bytes 16-17).
const HEADER_TEMPLATE: [u8; BGZF_HEADER_SIZE] = [
    0x1f, 0x8b, // gzip magic
    0x08, // compression method: deflate
    BGZF_NAME_COMMENT_EXTRA_FLAG, // flags: extra field present
    0x00, 0x00, 0x00, 0x00, // modification time
    BGZF_COMPRESSION_HINT_OTHER, // compression hint, patched per block
    0xff, // operating system: unknown
    0x06, 0x00, // extra field length: 6
    BGZF_SUBFIELD_ID1, BGZF_SUBFIELD_ID2, // subfield identifier
    0x02, 0x00, // subfield length: 2
    0x00, 0x00, // block size - 1, patched per block
];
110
111type BgzfResult<T> = Result<T, BgzfError>;
112
113#[non_exhaustive]
114#[derive(Error, Debug)]
115pub enum BgzfError {
116 #[error("Compressed block size ({0}) exceeds max allowed: ({1})")]
117 BlockSizeExceeded(usize, usize),
118 #[error("Invalid compression level: {0}")]
119 CompressionLevel(u8),
120 #[error(transparent)]
121 Io(#[from] io::Error),
122 #[error("Invalid checksum, found {found}, expected {expected}")]
123 InvalidChecksum { found: u32, expected: u32 },
124 #[error("Invalid block header: {0}")]
125 InvalidHeader(&'static str),
126 #[error("LibDeflater compression error: {0:?}")]
127 LibDeflaterCompress(libdeflater::CompressionError),
128 #[error(transparent)]
129 LibDelfaterDecompress(#[from] libdeflater::DecompressionError),
130}
131
/// The two 32-bit values stored in a block footer, as produced by `get_footer_values`.
#[derive(Clone, Copy, Debug)]
struct ChecksumValues {
    /// Checksum read from the first four footer bytes; the decompressor compares it with
    /// the CRC-32 it computes over the decompressed output.
    sum: u32,
    /// Value read from the last four footer bytes; the compressor stores the length of
    /// the uncompressed input there. Zero means there is nothing to inflate.
    amount: u32,
}
140
141#[derive(Debug, Clone, Copy, PartialEq, Eq)]
145pub struct CompressionLevel(CompressionLvl);
146
147#[allow(dead_code)]
148impl CompressionLevel {
149 #[allow(clippy::cast_lossless)]
153 pub fn new(level: u8) -> BgzfResult<Self> {
154 Ok(Self(
156 CompressionLvl::new(level as i32).map_err(|_e| BgzfError::CompressionLevel(level))?,
157 ))
158 }
159
160 fn inner(&self) -> &libdeflater::CompressionLvl {
162 &self.0
163 }
164}
165
166impl TryFrom<u8> for CompressionLevel {
167 type Error = BgzfError;
168
169 fn try_from(value: u8) -> Result<Self, Self::Error> {
179 Self::new(value)
180 }
181}
182
183impl From<CompressionLevel> for u8 {
184 fn from(level: CompressionLevel) -> Self {
186 let inner: i32 = level.inner().into();
187 inner as u8
188 }
189}
190
191impl From<&CompressionLevel> for u8 {
192 fn from(level: &CompressionLevel) -> Self {
194 let inner: i32 = level.inner().into();
195 inner as u8
196 }
197}
198
199pub struct Compressor {
213 inner: libdeflater::Compressor,
214 level: CompressionLevel,
215}
216
217#[allow(dead_code)]
218impl Compressor {
219 #[must_use]
228 pub fn new(level: CompressionLevel) -> Self {
229 Self { inner: libdeflater::Compressor::new(*level.inner()), level }
230 }
231
232 #[inline]
233 fn inner(&self) -> &libdeflater::Compressor {
234 &self.inner
235 }
236
237 #[inline]
238 fn inner_mut(&mut self) -> &mut libdeflater::Compressor {
239 &mut self.inner
240 }
241
242 #[inline(always)]
244 pub fn compress(&mut self, input: &[u8], buffer: &mut Vec<u8>) -> BgzfResult<()> {
245 let compress_bound = self.inner_mut().deflate_compress_bound(input.len());
247 let required_size = BGZF_HEADER_SIZE + compress_bound + BGZF_FOOTER_SIZE;
248
249 #[allow(unsafe_code)]
255 unsafe {
256 buffer_ops::resize_uninit(buffer, required_size);
257 }
258
259 let bytes_written = self
260 .inner_mut()
261 .deflate_compress(input, &mut buffer[BGZF_HEADER_SIZE..])
262 .map_err(BgzfError::LibDeflaterCompress)?;
263
264 if bytes_written >= MAX_BGZF_BLOCK_SIZE {
265 return Err(BgzfError::BlockSizeExceeded(bytes_written, MAX_BGZF_BLOCK_SIZE));
266 }
267
268 let mut crc = libdeflater::Crc::new();
270 crc.update(input);
271
272 let header = header_inner(self.level, bytes_written as u16);
274 buffer[0..BGZF_HEADER_SIZE].copy_from_slice(&header);
275
276 let footer_offset = BGZF_HEADER_SIZE + bytes_written;
278 buffer[footer_offset..footer_offset + BGZF_SIZEOF_CRC32]
279 .copy_from_slice(&crc.sum().to_le_bytes());
280 buffer[footer_offset + BGZF_SIZEOF_CRC32..footer_offset + BGZF_FOOTER_SIZE]
281 .copy_from_slice(&(input.len() as u32).to_le_bytes());
282
283 buffer.truncate(footer_offset + BGZF_FOOTER_SIZE);
285
286 Ok(())
287 }
288
289 pub fn append_eof(bytes: &mut Vec<u8>) {
291 bytes.extend(BGZF_EOF);
292 }
293}
294
295struct Decompressor(libdeflater::Decompressor);
297
298#[allow(dead_code)]
299impl Decompressor {
300 fn new() -> Self {
302 Self(libdeflater::Decompressor::new())
303 }
304
305 #[inline]
306 fn inner(&self) -> &libdeflater::Decompressor {
307 &self.0
308 }
309
310 #[inline]
311 fn inner_mut(&mut self) -> &mut libdeflater::Decompressor {
312 &mut self.0
313 }
314
315 #[inline]
320 fn decompress(
321 &mut self,
322 input: &[u8],
323 output: &mut [u8],
324 checksum_values: ChecksumValues,
325 ) -> BgzfResult<()> {
326 if checksum_values.amount != 0 {
327 let _bytes_decompressed = self.inner_mut().deflate_decompress(input, output)?;
328 }
329 let mut new_check = libdeflater::Crc::new();
330 new_check.update(output);
331
332 if checksum_values.sum != new_check.sum() {
333 return Err(BgzfError::InvalidChecksum {
334 found: new_check.sum(),
335 expected: checksum_values.sum,
336 });
337 }
338 Ok(())
339 }
340}
341
342impl Default for Decompressor {
343 fn default() -> Self {
344 Self::new()
345 }
346}
347
348#[inline(always)]
350fn header_inner(
351 compression_level: CompressionLevel,
352 compressed_size: u16,
353) -> [u8; BGZF_HEADER_SIZE] {
354 let mut header = HEADER_TEMPLATE;
355
356 header[BGZF_XFL_OFFSET] = if compression_level.inner() >= &CompressionLvl::best() {
358 BGZF_COMPRESSION_HINT_BEST
359 } else if compression_level.inner() <= &CompressionLvl::fastest() {
360 BGZF_COMPRESSION_HINT_FASTEST
361 } else {
362 BGZF_COMPRESSION_HINT_OTHER
363 };
364
365 let bsize = compressed_size + BGZF_HEADER_SIZE as u16 + BGZF_FOOTER_SIZE as u16 - 1;
367 header[BGZF_BLOCK_SIZE_OFFSET..BGZF_BLOCK_SIZE_OFFSET + 2]
368 .copy_from_slice(&bsize.to_le_bytes());
369
370 header
371}
372
373#[inline]
375fn check_header(bytes: &[u8]) -> BgzfResult<()> {
376 if bytes[3] & 4 != BGZF_NAME_COMMENT_EXTRA_FLAG {
378 Err(BgzfError::InvalidHeader("Extra field flag not set"))
379 } else if bytes[12] != BGZF_SUBFIELD_ID1 || bytes[13] != BGZF_SUBFIELD_ID2 {
380 Err(BgzfError::InvalidHeader("Bad SID"))
382 } else {
383 Ok(())
384 }
385}
386
387#[inline]
389fn get_block_size(bytes: &[u8]) -> usize {
390 LittleEndian::read_u16(&bytes[BGZF_BLOCK_SIZE_OFFSET..]) as usize + 1
391}
392
393#[inline]
395fn get_footer_values(input: &[u8]) -> ChecksumValues {
396 let check_sum = LittleEndian::read_u32(&input[input.len() - 8..input.len() - 4]);
397 let check_amount = LittleEndian::read_u32(&input[input.len() - 4..]);
398 ChecksumValues { sum: check_sum, amount: check_amount }
399}
400
401#[inline]
403fn strip_footer(input: &[u8]) -> &[u8] {
404 &input[..input.len() - BGZF_FOOTER_SIZE]
405}
406
#[cfg(test)]
mod test {
    use std::io::{Read, Write};
    use std::{
        fs::File,
        io::{BufReader, BufWriter},
    };

    use proptest::prelude::*;
    use tempfile::tempdir;

    use super::*;

    /// Counts how many times the BGZF EOF marker occurs anywhere in `bytes`.
    fn count_eof_markers(bytes: &[u8]) -> usize {
        bytes.windows(BGZF_EOF.len()).filter(|window| *window == BGZF_EOF).count()
    }

    /// An explicit `finish()` writes the EOF marker, and the later drop does not repeat it.
    #[test]
    fn test_eof_marker_written_once_with_finish() {
        let mut output = Vec::new();
        {
            let mut writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
            writer.write_all(b"hello").unwrap();
            writer.finish().unwrap();
        }

        assert!(output.ends_with(BGZF_EOF), "Output should end with BGZF_EOF marker");
        assert_eq!(count_eof_markers(&output), 1, "EOF marker should appear exactly once");
    }

    /// Dropping the writer without calling `finish()` still writes exactly one EOF marker.
    #[test]
    fn test_eof_marker_written_once_on_drop() {
        let mut output = Vec::new();
        {
            let mut writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
            writer.write_all(b"hello").unwrap();
        }

        assert!(output.ends_with(BGZF_EOF), "Output should end with BGZF_EOF marker");
        assert_eq!(count_eof_markers(&output), 1, "EOF marker should appear exactly once");
    }

    /// A writer that never received data produces the EOF marker and nothing else.
    #[test]
    fn test_eof_marker_empty_write() {
        let mut output = Vec::new();
        {
            let writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
            writer.finish().unwrap();
        }

        assert!(
            output.ends_with(BGZF_EOF),
            "Output should end with BGZF_EOF marker even with no data written"
        );
        assert_eq!(output.as_slice(), BGZF_EOF);
    }

    /// Intermediate `flush()` calls must not emit EOF markers of their own.
    #[test]
    fn test_multiple_flush_single_eof() {
        let mut output = Vec::new();
        {
            let mut writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
            writer.write_all(b"hello").unwrap();
            writer.flush().unwrap();
            writer.write_all(b"world").unwrap();
            writer.flush().unwrap();
            writer.finish().unwrap();
        }

        assert!(output.ends_with(BGZF_EOF), "Output should end with BGZF_EOF marker");
        assert_eq!(
            count_eof_markers(&output),
            1,
            "EOF marker should appear exactly once even after multiple flush() calls"
        );
    }

    /// Round-trips a short text through `Writer` and `Reader` via a file on disk.
    #[test]
    fn test_simple_bgzfsync() {
        let dir = tempdir().unwrap();

        let input = b"
        This is a longer test than normal to come up with a bunch of text.
        We'll read just a few lines at a time.
        What if this is a longer string, does that then make
        things fail?
        ";

        let orig_file = dir.path().join("orig.output.txt");
        let mut orig_writer = BufWriter::new(File::create(&orig_file).unwrap());
        orig_writer.write_all(input).unwrap();
        drop(orig_writer);

        let output_file = dir.path().join("output.txt");
        let out_writer = BufWriter::new(File::create(&output_file).unwrap());

        let mut bgzf = Writer::new(out_writer, CompressionLevel::new(3).unwrap());
        bgzf.write_all(input).unwrap();
        bgzf.finish().unwrap();

        let mut reader = BufReader::new(File::open(output_file).unwrap());
        let mut result = vec![];
        reader.read_to_end(&mut result).unwrap();

        let mut decoder = Reader::new(&result[..]);
        let mut bytes = vec![];
        decoder.read_to_end(&mut bytes).unwrap();

        assert_eq!(input.to_vec(), bytes);
    }

    const DICT_SIZE: usize = 32768;
    proptest! {
        /// Arbitrary data written in arbitrary chunk sizes, with any buffer capacity and
        /// compression level, must decompress to exactly what was written.
        #[test]
        fn proptest_bgzf(
            // `any::<u8>()` covers every byte value; the half-open `0..u8::MAX` never
            // produced 0xFF.
            input in prop::collection::vec(any::<u8>(), 1..(DICT_SIZE * 10)),
            buf_size in DICT_SIZE..BGZF_BLOCK_SIZE,
            write_size in 1..BGZF_BLOCK_SIZE * 4,
            // Inclusive upper bound so the highest level (12) is exercised as well.
            comp_level in 1..=12_u8
        ) {
            let dir = tempdir().unwrap();

            let output_file = dir.path().join("output.txt");
            let out_writer = BufWriter::new(File::create(&output_file).unwrap());

            let mut writer = Writer::with_capacity(
                out_writer,
                CompressionLevel::new(comp_level).unwrap(),
                buf_size,
            );

            for chunk in input.chunks(write_size) {
                writer.write_all(chunk).unwrap();
            }
            writer.finish().unwrap();

            let mut reader = BufReader::new(File::open(output_file).unwrap());
            let mut result = vec![];
            reader.read_to_end(&mut result).unwrap();

            let mut gz = Reader::new(&result[..]);
            let mut bytes = vec![];
            gz.read_to_end(&mut bytes).unwrap();

            // `prop_assert_eq!` reports the failure to proptest so it can shrink the case.
            prop_assert_eq!(input, bytes);
        }
    }
}