Skip to main content

bgzf/
lib.rs

1//! This library provides both high level readers and writers for the BGZF format as well as lower level
2//! compressor and decompressor functions.
3//!
4//! Bgzf is a multi-gzip format that adds an extra field to the header indicating how large the
5//! complete block (with header and footer) is.
6//!
7//! # Examples
8//!
9//! ```rust
10//! use bgzf::{Reader, Writer};
11//! use std::error::Error;
12//! use std::io;
13//!
14//! /// Contrived example that decompresses stdin and compresses to stdout.
15//! fn main() -> Result<(), Box<dyn Error>> {
16//!     let mut reader = Reader::new(io::stdin());
17//!     let mut writer = Writer::new(io::stdout(), 2.try_into()?);
18//!     let total_bytes = io::copy(&mut reader, &mut writer)?;
19//!     eprintln!("{} uncompressed bytes", total_bytes);
20//!     Ok(())
21//! }
22//! ```
23#![deny(unsafe_code)]
24#![allow(clippy::must_use_candidate, clippy::missing_errors_doc, clippy::missing_panics_doc)]
25
26// Re-export the reader and writer to the same level.
27mod reader;
28mod writer;
29pub use reader::*;
30pub use writer::*;
31
32use std::io;
33
34use byteorder::{ByteOrder, LittleEndian};
35use libdeflater::CompressionLvl;
36use thiserror::Error;
37
/// Helpers for sizing byte buffers without paying to zero-fill them first.
mod buffer_ops {
    /// Discards the contents of `buffer` and sets its length to `new_len`,
    /// leaving the new bytes uninitialized.
    ///
    /// Capacity is grown with `reserve_exact` when the existing allocation is
    /// too small.
    ///
    /// # Safety
    ///
    /// The caller must write every byte in `0..new_len` before reading any of
    /// them. Given that, the operation is sound because:
    /// - `u8` has no invalid bit patterns
    /// - the buffer is emptied first, so no stale data is carried over
    /// - `reserve_exact()` guarantees the capacity covers `new_len`
    #[inline(always)]
    #[allow(unsafe_code, clippy::uninit_vec)]
    pub(crate) unsafe fn resize_uninit(buffer: &mut Vec<u8>, new_len: usize) {
        // Start from length zero so `reserve_exact(new_len)` yields a capacity
        // of at least `new_len` in total.
        buffer.truncate(0);
        buffer.reserve_exact(new_len);
        // SAFETY: the capacity is at least `new_len` after the reservation
        // above; initializing the bytes before use is the caller's contract.
        unsafe { buffer.set_len(new_len) };
    }
}
57
/// Largest number of uncompressed bytes placed in one BGZF block (value taken
/// from bgzip); used for initializing blocks.
pub const BGZF_BLOCK_SIZE: usize = 0xff00;

/// Default buffer size: 128 KB, the same as pigz.
pub const BUFSIZE: usize = 128 << 10;

/// Upper limit for a BGZF block: 65536 bytes, i.e. `u16::MAX + 1`.
///
/// Default from bgzf: compress(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE
pub(crate) const MAX_BGZF_BLOCK_SIZE: usize = 1 << 16;
67
/// The 28-byte BGZF block with an empty payload that is written as the
/// end-of-file marker.
pub(crate) static BGZF_EOF: &[u8] = &[
    // -- gzip member header --
    0x1f, 0x8b, // ID1, ID2
    0x08, 0x04, // CM = DEFLATE, FLG = FEXTRA
    0x00, 0x00, 0x00, 0x00, // MTIME = 0
    0x00, 0xff, // XFL = 0, OS = 255 (unknown)
    0x06, 0x00, // XLEN = 6
    // -- BGZF extra subfield --
    b'B', b'C', // SI1, SI2
    0x02, 0x00, // SLEN = 2
    0x1b, 0x00, // BSIZE = 27 (this block's 28 bytes, minus one)
    // -- payload --
    0x03, 0x00, // CDATA
    // -- gzip trailer --
    0x00, 0x00, 0x00, 0x00, // CRC32 = 0x00000000
    0x00, 0x00, 0x00, 0x00, // ISIZE = 0
];
83
// Sizes of the fixed framing around the compressed payload.
/// Number of bytes in a BGZF block header.
pub(crate) const BGZF_HEADER_SIZE: usize = 18;
/// Number of bytes in a BGZF block footer (CRC32 followed by ISIZE).
pub(crate) const BGZF_FOOTER_SIZE: usize = 8;
/// Number of bytes taken by the CRC32 at the start of the footer.
pub(crate) const BGZF_SIZEOF_CRC32: usize = 4;

// Byte positions inside the header.
/// Offset of the XFL (compression hint) byte.
pub(crate) const BGZF_XFL_OFFSET: usize = 8;
/// Offset of the two-byte little-endian BSIZE field.
pub(crate) const BGZF_BLOCK_SIZE_OFFSET: usize = 16;

// Expected header contents.
/// Bit of the FLG byte that must be set (FEXTRA, see the header layout below).
pub(crate) const BGZF_NAME_COMMENT_EXTRA_FLAG: u8 = 1 << 2;
/// First identifier byte (SI1) of the BGZF extra subfield.
pub(crate) const BGZF_SUBFIELD_ID1: u8 = b'B';
/// Second identifier byte (SI2) of the BGZF extra subfield.
pub(crate) const BGZF_SUBFIELD_ID2: u8 = b'C';

// Values written to the XFL byte.
/// XFL value used for the highest compression level.
pub(crate) const BGZF_COMPRESSION_HINT_BEST: u8 = 2;
/// XFL value used for the fastest compression level.
pub(crate) const BGZF_COMPRESSION_HINT_FASTEST: u8 = 4;
/// XFL value used for every other compression level.
pub(crate) const BGZF_COMPRESSION_HINT_OTHER: u8 = 0;

/// Pre-computed BGZF header template. Only bytes 8 (XFL) and 16-17 (BSIZE) vary.
const HEADER_TEMPLATE: [u8; BGZF_HEADER_SIZE] = [
    0x1f, 0x8b, // ID1, ID2 (magic)
    0x08, // CM = DEFLATE
    0x04, // FLG = FEXTRA
    0x00, 0x00, 0x00, 0x00, // MTIME = 0
    0x00, // XFL = placeholder (byte 8)
    0xff, // OS = 255
    0x06, 0x00, // XLEN = 6
    BGZF_SUBFIELD_ID1, BGZF_SUBFIELD_ID2, // SI1, SI2
    0x02, 0x00, // SLEN = 2
    0x00, 0x00, // BSIZE placeholder (bytes 16-17)
];
110
111type BgzfResult<T> = Result<T, BgzfError>;
112
113#[non_exhaustive]
114#[derive(Error, Debug)]
115pub enum BgzfError {
116    #[error("Compressed block size ({0}) exceeds max allowed: ({1})")]
117    BlockSizeExceeded(usize, usize),
118    #[error("Invalid compression level: {0}")]
119    CompressionLevel(u8),
120    #[error(transparent)]
121    Io(#[from] io::Error),
122    #[error("Invalid checksum, found {found}, expected {expected}")]
123    InvalidChecksum { found: u32, expected: u32 },
124    #[error("Invalid block header: {0}")]
125    InvalidHeader(&'static str),
126    #[error("LibDeflater compression error: {0:?}")]
127    LibDeflaterCompress(libdeflater::CompressionError),
128    #[error(transparent)]
129    LibDelfaterDecompress(#[from] libdeflater::DecompressionError),
130}
131
/// Checksum and byte count that a block's decompressed data is expected to have.
#[derive(Clone, Copy, Debug)]
struct ChecksumValues {
    /// The checksum.
    sum: u32,
    /// How many bytes went into `sum`.
    amount: u32,
}
140
141/// Level of compression to use for for the compressors.
142///
143/// Valid values are 1-12. See [libdeflater](https://github.com/ebiggers/libdeflate#compression-levels) documentation on levels.
144#[derive(Debug, Clone, Copy, PartialEq, Eq)]
145pub struct CompressionLevel(CompressionLvl);
146
147#[allow(dead_code)]
148impl CompressionLevel {
149    /// Create a new [`CompressionLevel`] instance.
150    ///
151    /// Valid levels are 1-12.
152    #[allow(clippy::cast_lossless)]
153    pub fn new(level: u8) -> BgzfResult<Self> {
154        // libdeflater::CompressionLvlError contains no information
155        Ok(Self(
156            CompressionLvl::new(level as i32).map_err(|_e| BgzfError::CompressionLevel(level))?,
157        ))
158    }
159
160    /// Get the inner compression level
161    fn inner(&self) -> &libdeflater::CompressionLvl {
162        &self.0
163    }
164}
165
166impl TryFrom<u8> for CompressionLevel {
167    type Error = BgzfError;
168
169    /// Try to convert a `u8` to a compression level.
170    ///
171    /// # Example
172    /// ```rust
173    /// use bgzf::CompressionLevel;
174    ///
175    /// let level: CompressionLevel = 2.try_into().unwrap();
176    /// assert_eq!(level, CompressionLevel::new(2).unwrap());
177    /// ```
178    fn try_from(value: u8) -> Result<Self, Self::Error> {
179        Self::new(value)
180    }
181}
182
183impl From<CompressionLevel> for u8 {
184    /// Convenience method vor converting [`CompressionLevel`] back to a [`u8`].
185    fn from(level: CompressionLevel) -> Self {
186        let inner: i32 = level.inner().into();
187        inner as u8
188    }
189}
190
191impl From<&CompressionLevel> for u8 {
192    /// Convenience method vor converting [`CompressionLevel`] back to a [`u8`].
193    fn from(level: &CompressionLevel) -> Self {
194        let inner: i32 = level.inner().into();
195        inner as u8
196    }
197}
198
199/// [`Compressor`] will BGZF compress a block of bytes with the [`Compressor::compress`] method, allowing for reuse of the compressor itself.
200///
201/// # Example
202///
203/// ```rust
204/// use bgzf::{Compressor, CompressionLevel};
205///
206/// let mut compressor = Compressor::new(2.try_into().unwrap());
207/// let input = &[b'A'; 100];
208/// let mut output_buffer = vec![];
209/// compressor.compress(input, &mut output_buffer).unwrap();
210/// assert!(input.len() > output_buffer.len());
211/// ```
212pub struct Compressor {
213    inner: libdeflater::Compressor,
214    level: CompressionLevel,
215}
216
217#[allow(dead_code)]
218impl Compressor {
219    /// Create a new [`Compressor`] with the given [`CompressionLevel`].
220    ///
221    /// # Example
222    ///
223    /// ```rust
224    /// use bgzf::Compressor;
225    /// let compressor = Compressor::new(3.try_into().expect("Invalid compression level"));
226    /// ```
227    #[must_use]
228    pub fn new(level: CompressionLevel) -> Self {
229        Self { inner: libdeflater::Compressor::new(*level.inner()), level }
230    }
231
232    #[inline]
233    fn inner(&self) -> &libdeflater::Compressor {
234        &self.inner
235    }
236
237    #[inline]
238    fn inner_mut(&mut self) -> &mut libdeflater::Compressor {
239        &mut self.inner
240    }
241
242    /// Compress a block of bytes, adding a header and footer.
243    #[inline(always)]
244    pub fn compress(&mut self, input: &[u8], buffer: &mut Vec<u8>) -> BgzfResult<()> {
245        // Use libdeflate's official bound calculation
246        let compress_bound = self.inner_mut().deflate_compress_bound(input.len());
247        let required_size = BGZF_HEADER_SIZE + compress_bound + BGZF_FOOTER_SIZE;
248
249        // SAFETY: All bytes in 0..final_len are written before the function returns:
250        // - bytes 0..18: header via copy_from_slice
251        // - bytes 18..18+bytes_written: written by deflate_compress
252        // - bytes footer_offset..footer_offset+8: footer via copy_from_slice
253        // - buffer is truncated to final_len, removing any uninitialized trailing bytes
254        #[allow(unsafe_code)]
255        unsafe {
256            buffer_ops::resize_uninit(buffer, required_size);
257        }
258
259        let bytes_written = self
260            .inner_mut()
261            .deflate_compress(input, &mut buffer[BGZF_HEADER_SIZE..])
262            .map_err(BgzfError::LibDeflaterCompress)?;
263
264        if bytes_written >= MAX_BGZF_BLOCK_SIZE {
265            return Err(BgzfError::BlockSizeExceeded(bytes_written, MAX_BGZF_BLOCK_SIZE));
266        }
267
268        // Compute CRC32
269        let mut crc = libdeflater::Crc::new();
270        crc.update(input);
271
272        // Write header
273        let header = header_inner(self.level, bytes_written as u16);
274        buffer[0..BGZF_HEADER_SIZE].copy_from_slice(&header);
275
276        // Write footer directly at computed offset
277        let footer_offset = BGZF_HEADER_SIZE + bytes_written;
278        buffer[footer_offset..footer_offset + BGZF_SIZEOF_CRC32]
279            .copy_from_slice(&crc.sum().to_le_bytes());
280        buffer[footer_offset + BGZF_SIZEOF_CRC32..footer_offset + BGZF_FOOTER_SIZE]
281            .copy_from_slice(&(input.len() as u32).to_le_bytes());
282
283        // Truncate to final size (removes uninitialized bytes beyond footer)
284        buffer.truncate(footer_offset + BGZF_FOOTER_SIZE);
285
286        Ok(())
287    }
288
289    /// Append the EOF block.
290    pub fn append_eof(bytes: &mut Vec<u8>) {
291        bytes.extend(BGZF_EOF);
292    }
293}
294
295/// [`Decompressor`] will decompress a BGZF block.
296struct Decompressor(libdeflater::Decompressor);
297
298#[allow(dead_code)]
299impl Decompressor {
300    /// Create a new [`Decompressor`].
301    fn new() -> Self {
302        Self(libdeflater::Decompressor::new())
303    }
304
305    #[inline]
306    fn inner(&self) -> &libdeflater::Decompressor {
307        &self.0
308    }
309
310    #[inline]
311    fn inner_mut(&mut self) -> &mut libdeflater::Decompressor {
312        &mut self.0
313    }
314
315    /// Decompress a block of bytes.
316    ///
317    /// This expects the `output` to be the exact size needed to hold the decompressed input.
318    /// This expects the input slice to have the header and footer values removed.
319    #[inline]
320    fn decompress(
321        &mut self,
322        input: &[u8],
323        output: &mut [u8],
324        checksum_values: ChecksumValues,
325    ) -> BgzfResult<()> {
326        if checksum_values.amount != 0 {
327            let _bytes_decompressed = self.inner_mut().deflate_decompress(input, output)?;
328        }
329        let mut new_check = libdeflater::Crc::new();
330        new_check.update(output);
331
332        if checksum_values.sum != new_check.sum() {
333            return Err(BgzfError::InvalidChecksum {
334                found: new_check.sum(),
335                expected: checksum_values.sum,
336            });
337        }
338        Ok(())
339    }
340}
341
342impl Default for Decompressor {
343    fn default() -> Self {
344        Self::new()
345    }
346}
347
348/// Create a BGZF header with the given compression level and compressed size.
349#[inline(always)]
350fn header_inner(
351    compression_level: CompressionLevel,
352    compressed_size: u16,
353) -> [u8; BGZF_HEADER_SIZE] {
354    let mut header = HEADER_TEMPLATE;
355
356    // Patch XFL (compression hint)
357    header[BGZF_XFL_OFFSET] = if compression_level.inner() >= &CompressionLvl::best() {
358        BGZF_COMPRESSION_HINT_BEST
359    } else if compression_level.inner() <= &CompressionLvl::fastest() {
360        BGZF_COMPRESSION_HINT_FASTEST
361    } else {
362        BGZF_COMPRESSION_HINT_OTHER
363    };
364
365    // Patch BSIZE (little-endian u16)
366    let bsize = compressed_size + BGZF_HEADER_SIZE as u16 + BGZF_FOOTER_SIZE as u16 - 1;
367    header[BGZF_BLOCK_SIZE_OFFSET..BGZF_BLOCK_SIZE_OFFSET + 2]
368        .copy_from_slice(&bsize.to_le_bytes());
369
370    header
371}
372
373/// Check that the header is as expected for this format
374#[inline]
375fn check_header(bytes: &[u8]) -> BgzfResult<()> {
376    // Check that the extra field flag is set
377    if bytes[3] & 4 != BGZF_NAME_COMMENT_EXTRA_FLAG {
378        Err(BgzfError::InvalidHeader("Extra field flag not set"))
379    } else if bytes[12] != BGZF_SUBFIELD_ID1 || bytes[13] != BGZF_SUBFIELD_ID2 {
380        // Check for BC in SID
381        Err(BgzfError::InvalidHeader("Bad SID"))
382    } else {
383        Ok(())
384    }
385}
386
387/// Extract the block size from the header.
388#[inline]
389fn get_block_size(bytes: &[u8]) -> usize {
390    LittleEndian::read_u16(&bytes[BGZF_BLOCK_SIZE_OFFSET..]) as usize + 1
391}
392
393/// Get the expected uncompressed size and check sum from the footer
394#[inline]
395fn get_footer_values(input: &[u8]) -> ChecksumValues {
396    let check_sum = LittleEndian::read_u32(&input[input.len() - 8..input.len() - 4]);
397    let check_amount = LittleEndian::read_u32(&input[input.len() - 4..]);
398    ChecksumValues { sum: check_sum, amount: check_amount }
399}
400
401/// Strip the footer off of a compressed block.
402#[inline]
403fn strip_footer(input: &[u8]) -> &[u8] {
404    &input[..input.len() - BGZF_FOOTER_SIZE]
405}
406
#[cfg(test)]
mod test {
    use std::io::{Read, Write};
    use std::{
        fs::File,
        io::{BufReader, BufWriter},
    };

    use proptest::prelude::*;
    use tempfile::tempdir;

    use super::*;

    /// Count how many times the EOF marker block occurs anywhere in `output`.
    fn count_eof_markers(output: &[u8]) -> usize {
        output.windows(BGZF_EOF.len()).filter(|window| *window == BGZF_EOF).count()
    }

    /// Test that the EOF marker is written exactly once at the end of the output
    /// when using finish().
    #[test]
    fn test_eof_marker_written_once_with_finish() {
        // Test with data that doesn't fill a complete block
        let mut output = Vec::new();
        {
            let mut writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
            writer.write_all(b"hello").unwrap();
            writer.finish().unwrap();
        }

        // Verify EOF marker appears exactly once at the end
        assert!(output.ends_with(BGZF_EOF), "Output should end with BGZF_EOF marker");
        assert_eq!(count_eof_markers(&output), 1, "EOF marker should appear exactly once");
    }

    /// Test that EOF marker is written exactly once when relying on Drop.
    #[test]
    fn test_eof_marker_written_once_on_drop() {
        let mut output = Vec::new();
        {
            let mut writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
            writer.write_all(b"hello").unwrap();
            // Don't call finish(), let Drop handle it
        }

        // Verify EOF marker appears exactly once at the end
        assert!(output.ends_with(BGZF_EOF), "Output should end with BGZF_EOF marker");
        assert_eq!(count_eof_markers(&output), 1, "EOF marker should appear exactly once");
    }

    /// Test that EOF marker is written even when the buffer is empty.
    #[test]
    fn test_eof_marker_empty_write() {
        let mut output = Vec::new();
        {
            let writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
            // Don't write any data, just finish
            writer.finish().unwrap();
        }

        // Should still have the EOF marker
        assert!(
            output.ends_with(BGZF_EOF),
            "Output should end with BGZF_EOF marker even with no data written"
        );
        // With no data, output should be exactly the EOF marker
        assert_eq!(output.as_slice(), BGZF_EOF);
    }

    /// Test that calling flush() multiple times doesn't write multiple EOF markers.
    #[test]
    fn test_multiple_flush_single_eof() {
        let mut output = Vec::new();
        {
            let mut writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
            writer.write_all(b"hello").unwrap();
            writer.flush().unwrap();
            writer.write_all(b"world").unwrap();
            writer.flush().unwrap();
            writer.finish().unwrap();
        }

        // Verify EOF marker appears exactly once at the end
        assert!(output.ends_with(BGZF_EOF), "Output should end with BGZF_EOF marker");
        assert_eq!(
            count_eof_markers(&output),
            1,
            "EOF marker should appear exactly once even after multiple flush() calls"
        );
    }

    /// Round-trip a short text through a file-backed `Writer` and a `Reader`.
    #[test]
    fn test_simple_bgzfsync() {
        let dir = tempdir().unwrap();

        // Define and write input bytes
        let input = b"
        This is a longer test than normal to come up with a bunch of text.
        We'll read just a few lines at a time.
        What if this is a longer string, does that then make
        things fail?
        ";

        let orig_file = dir.path().join("orig.output.txt");
        let mut orig_writer = BufWriter::new(File::create(&orig_file).unwrap());
        orig_writer.write_all(input).unwrap();
        drop(orig_writer);

        // Create output file
        let output_file = dir.path().join("output.txt");
        let out_writer = BufWriter::new(File::create(&output_file).unwrap());

        // Compress input to output
        let mut bgzf = Writer::new(out_writer, CompressionLevel::new(3).unwrap());
        bgzf.write_all(input).unwrap();
        bgzf.finish().unwrap();

        // Read output back in
        let mut reader = BufReader::new(File::open(output_file).unwrap());
        let mut result = vec![];
        reader.read_to_end(&mut result).unwrap();

        // Decompress it
        let mut decoder = Reader::new(&result[..]);
        let mut bytes = vec![];
        decoder.read_to_end(&mut bytes).unwrap();

        // Assert decompressed output is equal to input
        assert_eq!(input.to_vec(), bytes);
    }

    const DICT_SIZE: usize = 32768;
    proptest! {
        /// Round-trip arbitrary input using arbitrary buffer sizes, write
        /// sizes and compression levels.
        #[test]
        fn proptest_bgzf(
            // `any::<u8>()` covers every byte value, including 0xFF.
            input in prop::collection::vec(any::<u8>(), 1..(DICT_SIZE * 10)),
            buf_size in DICT_SIZE..BGZF_BLOCK_SIZE,
            write_size in 1..BGZF_BLOCK_SIZE * 4,
            // Inclusive range so that the highest valid level (12) is exercised too.
            comp_level in 1..=12_u8
        ) {
            let dir = tempdir().unwrap();

            // Create output file
            let output_file = dir.path().join("output.txt");
            let out_writer = BufWriter::new(File::create(&output_file).unwrap());

            // Compress input to output
            let mut writer = Writer::with_capacity(out_writer, CompressionLevel::new(comp_level).unwrap(), buf_size);

            for chunk in input.chunks(write_size) {
                writer.write_all(chunk).unwrap();
            }
            writer.finish().unwrap();

            // Read output back in
            let mut reader = BufReader::new(File::open(output_file).unwrap());
            let mut result = vec![];
            reader.read_to_end(&mut result).unwrap();

            // Decompress it
            let mut gz = Reader::new(&result[..]);
            let mut bytes = vec![];
            gz.read_to_end(&mut bytes).unwrap();

            // Assert decompressed output is equal to input
            prop_assert_eq!(input, bytes);
        }
    }
}