bgzf/
lib.rs

1//! This library provides both high level readers and writers for the BGZF format as well as lower level
2//! compressor and decompressor functions.
3//!
4//! Bgzf is a multi-gzip format that adds an extra field to the header indicating how large the
5//! complete block (with header and footer) is.
6//!
7//! # Examples
8//!
9//! ```rust
10//! use bgzf::{Reader, Writer};
11//! use std::error::Error;
12//! use std::io;
13//!
14//! /// Contrived example that decompresses stdin and compresses to stdout.
15//! fn main() -> Result<(), Box<dyn Error>> {
16//!     let mut reader = Reader::new(io::stdin());
17//!     let mut writer = Writer::new(io::stdout(), 2.try_into()?);
18//!     let total_bytes = io::copy(&mut reader, &mut writer)?;
19//!     eprintln!("{} uncompressed bytes", total_bytes);
20//!     Ok(())
21//! }
22//! ```
23#![forbid(unsafe_code)]
24#![allow(clippy::must_use_candidate, clippy::missing_errors_doc, clippy::missing_panics_doc)]
25
26// Re-export the reader and writer to the same level.
27mod reader;
28mod writer;
29pub use reader::*;
30pub use writer::*;
31
32use std::io;
33
34use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
35use libdeflater::CompressionLvl;
36use thiserror::Error;
37
/// Largest number of uncompressed bytes put into one BGZF block (value taken from bgzip);
/// used for initializing blocks.
pub const BGZF_BLOCK_SIZE: usize = 0xff00;

/// Default buffer size of 128 KB, the same as pigz.
pub const BUFSIZE: usize = 1 << 17;

/// Maximum size of a compressed BGZF block: 65536 bytes, which is `u16::MAX + 1`.
///
/// Default from bgzf: `compress(BGZF_BLOCK_SIZE) < MAX_BGZF_BLOCK_SIZE`.
pub(crate) const MAX_BGZF_BLOCK_SIZE: usize = 1 << 16;
47
/// The fixed, empty BGZF block that is appended to mark end-of-file (28 bytes).
pub(crate) static BGZF_EOF: &[u8] = &[
    31, 139, // ID1, ID2
    8, // CM = DEFLATE
    4, // FLG = FEXTRA
    0, 0, 0, 0, // MTIME = 0
    0, // XFL = 0
    255, // OS = 255 (unknown)
    6, 0, // XLEN = 6
    b'B', b'C', // SI1, SI2
    2, 0, // SLEN = 2
    27, 0, // BSIZE = 27
    3, 0, // CDATA
    0, 0, 0, 0, // CRC32 = 0x00000000
    0, 0, 0, 0, // ISIZE = 0
];
63
/// Number of bytes in the block header written in front of the compressed data.
pub(crate) const BGZF_HEADER_SIZE: usize = 18;
/// Number of bytes in the block footer (checksum followed by uncompressed length).
pub(crate) const BGZF_FOOTER_SIZE: usize = 8;
/// First magic byte (ID1).
pub(crate) const BGZF_MAGIC_BYTE_A: u8 = 0x1f;
/// Second magic byte (ID2).
pub(crate) const BGZF_MAGIC_BYTE_B: u8 = 0x8b;
/// Compression method byte (CM = DEFLATE).
pub(crate) const BGZF_COMPRESSION_METHOD: u8 = 0x08;
/// Flag byte with the extra-field bit set (FLG = FEXTRA).
pub(crate) const BGZF_NAME_COMMENT_EXTRA_FLAG: u8 = 1 << 2;
/// Modification time written into every header.
pub(crate) const BGZF_DEFAULT_MTIME: u32 = 0;
/// Operating system byte written into every header (255 = unknown).
pub(crate) const BGZF_DEFAULT_OS: u8 = 0xff;
/// Length of the extra field (XLEN).
pub(crate) const BGZF_EXTRA_FLAG_LEN: u16 = 6;
/// First identifier byte of the BGZF extra subfield (SI1).
pub(crate) const BGZF_SUBFIELD_ID1: u8 = 0x42;
/// Second identifier byte of the BGZF extra subfield (SI2).
pub(crate) const BGZF_SUBFIELD_ID2: u8 = 0x43;
/// Length of the BGZF extra subfield payload (SLEN).
pub(crate) const BGZF_SUBFIELD_LEN: u16 = 2;
/// Offset within the header of the little-endian `u16` holding the block size minus one.
pub(crate) const BGZF_BLOCK_SIZE_OFFSET: usize = 16;

/// Header compression hint used for the best (slowest) compression level.
pub(crate) const BGZF_COMPRESSION_HINT_BEST: u8 = 2;
/// Header compression hint used for the fastest compression level.
pub(crate) const BGZF_COMPRESSION_HINT_FASTEST: u8 = 4;
/// Header compression hint used for every other compression level.
pub(crate) const BGZF_COMPRESSION_HINT_OTHER: u8 = 0;
81
/// Fraction of the input length that is reserved as additional output space.
const EXTRA: f64 = 0.1;

/// Compute how much headroom to add to an output buffer on top of the input length.
///
/// The result is 10% of `input_len`, but never less than 128 bytes. The headroom exists because
/// some compression levels can make the output larger than the input for certain data
/// (i.e. totally random input data).
#[inline]
fn extra_amount(input_len: usize) -> usize {
    let ten_percent = (input_len as f64 * EXTRA) as usize;
    ten_percent.max(128)
}
91
92type BgzfResult<T> = Result<T, BgzfError>;
93
94#[non_exhaustive]
95#[derive(Error, Debug)]
96pub enum BgzfError {
97    #[error("Compressed block size ({0}) exceeds max allowed: ({1})")]
98    BlockSizeExceeded(usize, usize),
99    #[error("Invalid compression level: {0}")]
100    CompressionLevel(u8),
101    #[error(transparent)]
102    Io(#[from] io::Error),
103    #[error("Invalid checksum, found {found}, expected {expected}")]
104    InvalidChecksum { found: u32, expected: u32 },
105    #[error("Invalid block header: {0}")]
106    InvalidHeader(&'static str),
107    #[error("LibDeflater compression error: {0:?}")]
108    LibDeflaterCompress(libdeflater::CompressionError),
109    #[error(transparent)]
110    LibDelfaterDecompress(#[from] libdeflater::DecompressionError),
111}
112
/// Checksum and byte count that decompressed data is expected to match.
#[derive(Clone, Copy, Debug)]
struct ChecksumValues {
    /// Expected checksum of the decompressed bytes.
    sum: u32,
    /// Number of bytes that were fed into the checksum.
    amount: u32,
}
121
122/// Level of compression to use for for the compressors.
123///
124/// Valid values are 1-12. See [libdeflater](https://github.com/ebiggers/libdeflate#compression-levels) documentation on levels.
125#[derive(Debug, Clone, Copy, PartialEq, Eq)]
126pub struct CompressionLevel(CompressionLvl);
127
128#[allow(dead_code)]
129impl CompressionLevel {
130    /// Create a new [`CompressionLevel`] instance.
131    ///
132    /// Valid levels are 1-12.
133    #[allow(clippy::cast_lossless)]
134    pub fn new(level: u8) -> BgzfResult<Self> {
135        // libdeflater::CompressionLvlError contains no information
136        Ok(Self(
137            CompressionLvl::new(level as i32).map_err(|_e| BgzfError::CompressionLevel(level))?,
138        ))
139    }
140
141    /// Get the inner compression level
142    fn inner(&self) -> &libdeflater::CompressionLvl {
143        &self.0
144    }
145}
146
147impl TryFrom<u8> for CompressionLevel {
148    type Error = BgzfError;
149
150    /// Try to convert a `u8` to a compression level.
151    ///
152    /// # Example
153    /// ```rust
154    /// use bgzf::CompressionLevel;
155    ///
156    /// let level: CompressionLevel = 2.try_into().unwrap();
157    /// assert_eq!(level, CompressionLevel::new(2).unwrap());
158    /// ```
159    fn try_from(value: u8) -> Result<Self, Self::Error> {
160        Self::new(value)
161    }
162}
163
164impl From<CompressionLevel> for u8 {
165    /// Convenience method vor converting [`CompressionLevel`] back to a [`u8`].
166    fn from(level: CompressionLevel) -> Self {
167        let inner: i32 = level.inner().into();
168        inner as u8
169    }
170}
171
172impl From<&CompressionLevel> for u8 {
173    /// Convenience method vor converting [`CompressionLevel`] back to a [`u8`].
174    fn from(level: &CompressionLevel) -> Self {
175        let inner: i32 = level.inner().into();
176        inner as u8
177    }
178}
179
180/// [`Compressor`] will BGZF compress a block of bytes with the [`Compressor::compress`] method, allowing for reuse of the compressor itself.
181///
182/// # Example
183///
184/// ```rust
185/// use bgzf::{Compressor, CompressionLevel};
186///
187/// let mut compressor = Compressor::new(2.try_into().unwrap());
188/// let input = &[b'A'; 100];
189/// let mut output_buffer = vec![];
190/// compressor.compress(input, &mut output_buffer).unwrap();
191/// assert!(input.len() > output_buffer.len());
192/// ```
193pub struct Compressor {
194    inner: libdeflater::Compressor,
195    level: CompressionLevel,
196}
197
198#[allow(dead_code)]
199impl Compressor {
200    /// Create a new [`Compressor`] with the given [`CompressionLevel`].
201    ///
202    /// # Example
203    ///
204    /// ```rust
205    /// use bgzf::Compressor;
206    /// let compressor = Compressor::new(3.try_into().expect("Invalid compression level"));
207    /// ```
208    pub fn new(level: CompressionLevel) -> Self {
209        Self { inner: libdeflater::Compressor::new(*level.inner()), level }
210    }
211
212    #[inline]
213    fn inner(&self) -> &libdeflater::Compressor {
214        &self.inner
215    }
216
217    #[inline]
218    fn inner_mut(&mut self) -> &mut libdeflater::Compressor {
219        &mut self.inner
220    }
221
222    /// Compress a block of bytes, adding a header and footer.
223    #[inline]
224    pub fn compress(&mut self, input: &[u8], buffer: &mut Vec<u8>) -> BgzfResult<()> {
225        buffer.resize_with(
226            BGZF_HEADER_SIZE + input.len() + extra_amount(input.len()) + BGZF_FOOTER_SIZE,
227            || 0,
228        );
229
230        let bytes_written = self
231            .inner_mut()
232            .deflate_compress(input, &mut buffer[BGZF_HEADER_SIZE..])
233            .map_err(BgzfError::LibDeflaterCompress)?;
234
235        // Make sure that compressed buffer is smaller than
236        if bytes_written >= MAX_BGZF_BLOCK_SIZE {
237            return Err(BgzfError::BlockSizeExceeded(bytes_written, MAX_BGZF_BLOCK_SIZE));
238        }
239        let mut check = libdeflater::Crc::new();
240        check.update(input);
241
242        // Add header with total byte sizes
243        let header = header_inner(self.level, bytes_written as u16);
244        buffer[0..BGZF_HEADER_SIZE].copy_from_slice(&header);
245        buffer.truncate(BGZF_HEADER_SIZE + bytes_written);
246
247        buffer.write_u32::<LittleEndian>(check.sum())?;
248        buffer.write_u32::<LittleEndian>(input.len() as u32)?;
249
250        Ok(())
251    }
252
253    /// Append the EOF block.
254    pub fn append_eof(bytes: &mut Vec<u8>) {
255        bytes.extend(BGZF_EOF);
256    }
257}
258
259/// [`Decompressor`] will decompress a BGZF block.
260struct Decompressor(libdeflater::Decompressor);
261
262#[allow(dead_code)]
263impl Decompressor {
264    /// Create a new [`Decompressor`].
265    fn new() -> Self {
266        Self(libdeflater::Decompressor::new())
267    }
268
269    #[inline]
270    fn inner(&self) -> &libdeflater::Decompressor {
271        &self.0
272    }
273
274    #[inline]
275    fn inner_mut(&mut self) -> &mut libdeflater::Decompressor {
276        &mut self.0
277    }
278
279    /// Decompress a block of bytes.
280    ///
281    /// This expects the `output` to be the exact size needed to hold the decompressed input.
282    /// This expects the input slice to have the header and footer values removed.
283    #[inline]
284    fn decompress(
285        &mut self,
286        input: &[u8],
287        output: &mut [u8],
288        checksum_values: ChecksumValues,
289    ) -> BgzfResult<()> {
290        if checksum_values.amount != 0 {
291            let _bytes_decompressed = self.inner_mut().deflate_decompress(input, output)?;
292        }
293        let mut new_check = libdeflater::Crc::new();
294        new_check.update(output);
295
296        if checksum_values.sum != new_check.sum() {
297            return Err(BgzfError::InvalidChecksum {
298                found: new_check.sum(),
299                expected: checksum_values.sum,
300            });
301        }
302        Ok(())
303    }
304}
305
306impl Default for Decompressor {
307    fn default() -> Self {
308        Self::new()
309    }
310}
311
312/// Create an Bgzf style header.
313#[inline]
314fn header_inner(compression_level: CompressionLevel, compressed_size: u16) -> Vec<u8> {
315    // Determine hint to place in header
316    // From https://github.com/rust-lang/flate2-rs/blob/b2e976da21c18c8f31132e93a7f803b5e32f2b6d/src/gz/mod.rs#L235
317    let comp_value = if compression_level.inner() >= &CompressionLvl::best() {
318        BGZF_COMPRESSION_HINT_BEST
319    } else if compression_level.inner() <= &CompressionLvl::fastest() {
320        BGZF_COMPRESSION_HINT_FASTEST
321    } else {
322        BGZF_COMPRESSION_HINT_OTHER
323    };
324
325    let mut header: Vec<u8> = Vec::with_capacity(20);
326    header.write_u8(BGZF_MAGIC_BYTE_A).unwrap(); // magic byte
327    header.write_u8(BGZF_MAGIC_BYTE_B).unwrap(); // magic byte
328    header.write_u8(BGZF_COMPRESSION_METHOD).unwrap(); // compression method
329    header.write_u8(BGZF_NAME_COMMENT_EXTRA_FLAG).unwrap(); // name / comment / extraflag
330    header.write_u32::<LittleEndian>(BGZF_DEFAULT_MTIME).unwrap(); // mtime
331    header.write_u8(comp_value).unwrap(); // compression value
332    header.write_u8(BGZF_DEFAULT_OS).unwrap(); // OS
333    header.write_u16::<LittleEndian>(BGZF_EXTRA_FLAG_LEN).unwrap(); // Extra flag len
334    header.write_u8(BGZF_SUBFIELD_ID1).unwrap(); // Bgzf subfield ID 1
335    header.write_u8(BGZF_SUBFIELD_ID2).unwrap(); // Bgzf subfield ID2
336    header.write_u16::<LittleEndian>(BGZF_SUBFIELD_LEN).unwrap(); // Bgzf subfield len
337    header
338        .write_u16::<LittleEndian>(
339            compressed_size + BGZF_HEADER_SIZE as u16 + BGZF_FOOTER_SIZE as u16 - 1,
340        )
341        .unwrap(); // Size of block including header and footer - 1 BLEN
342
343    header
344}
345
346/// Check that the header is as expected for this format
347#[inline]
348fn check_header(bytes: &[u8]) -> BgzfResult<()> {
349    // Check that the extra field flag is set
350    if bytes[3] & 4 != BGZF_NAME_COMMENT_EXTRA_FLAG {
351        Err(BgzfError::InvalidHeader("Extra field flag not set"))
352    } else if bytes[12] != BGZF_SUBFIELD_ID1 || bytes[13] != BGZF_SUBFIELD_ID2 {
353        // Check for BC in SID
354        Err(BgzfError::InvalidHeader("Bad SID"))
355    } else {
356        Ok(())
357    }
358}
359
360/// Extract the block size from the header.
361#[inline]
362fn get_block_size(bytes: &[u8]) -> usize {
363    LittleEndian::read_u16(&bytes[BGZF_BLOCK_SIZE_OFFSET..]) as usize + 1
364}
365
366/// Get the expected uncompressed size and check sum from the footer
367#[inline]
368fn get_footer_values(input: &[u8]) -> ChecksumValues {
369    let check_sum = LittleEndian::read_u32(&input[input.len() - 8..input.len() - 4]);
370    let check_amount = LittleEndian::read_u32(&input[input.len() - 4..]);
371    ChecksumValues { sum: check_sum, amount: check_amount }
372}
373
374/// Strip the footer off of a compressed block.
375#[inline]
376fn strip_footer(input: &[u8]) -> &[u8] {
377    &input[..input.len() - BGZF_FOOTER_SIZE]
378}
379
#[cfg(test)]
mod test {
    use std::io::{Read, Write};
    use std::{
        fs::File,
        io::{BufReader, BufWriter},
    };

    use proptest::prelude::*;
    use tempfile::tempdir;

    use super::*;

    // Round-trip a small fixed input through `Writer` and `Reader` via a temporary file.
    #[test]
    fn test_simple_bgzfsync() {
        let dir = tempdir().unwrap();

        // Define and write input bytes
        let input = b"
        This is a longer test than normal to come up with a bunch of text.
        We'll read just a few lines at a time.
        What if this is a longer string, does that then make
        things fail?
        ";

        let orig_file = dir.path().join("orig.output.txt");
        let mut orig_writer = BufWriter::new(File::create(&orig_file).unwrap());
        orig_writer.write_all(input).unwrap();
        drop(orig_writer);

        // Create output file
        let output_file = dir.path().join("output.txt");
        let out_writer = BufWriter::new(File::create(&output_file).unwrap());

        // Compress input to output
        let mut bgzf = Writer::new(out_writer, CompressionLevel::new(3).unwrap());
        bgzf.write_all(input).unwrap();
        bgzf.flush().unwrap();
        drop(bgzf);

        // Read output back in
        let mut reader = BufReader::new(File::open(output_file).unwrap());
        let mut result = vec![];
        reader.read_to_end(&mut result).unwrap();

        // Decompress it
        let mut decoder = Reader::new(&result[..]);
        let mut bytes = vec![];
        decoder.read_to_end(&mut bytes).unwrap();

        // Assert decompressed output is equal to input
        assert_eq!(input.to_vec(), bytes);
    }

    const DICT_SIZE: usize = 32768;
    proptest! {
        // Round-trip arbitrary input with arbitrary buffer sizes, write sizes and levels.
        #[test]
        fn proptest_bgzf(
            // Inclusive ranges: the byte value 255 and the documented maximum level 12 were
            // never generated by the previous exclusive ranges.
            input in prop::collection::vec(0..=u8::MAX, 1..(DICT_SIZE * 10)),
            buf_size in DICT_SIZE..BGZF_BLOCK_SIZE,
            write_size in 1..BGZF_BLOCK_SIZE * 4,
            comp_level in 1..=12_u8
        ) {
            let dir = tempdir().unwrap();

            // Create output file
            let output_file = dir.path().join("output.txt");
            let out_writer = BufWriter::new(File::create(&output_file).unwrap());

            // Compress input to output
            let mut writer = Writer::with_capacity(out_writer, CompressionLevel::new(comp_level).unwrap(), buf_size);

            for chunk in input.chunks(write_size) {
                writer.write_all(chunk).unwrap();
            }
            writer.flush().unwrap();
            drop(writer);

            // Read output back in
            let mut reader = BufReader::new(File::open(output_file).unwrap());
            let mut result = vec![];
            reader.read_to_end(&mut result).unwrap();

            // Decompress it
            let mut gz = Reader::new(&result[..]);
            let mut bytes = vec![];
            gz.read_to_end(&mut bytes).unwrap();

            // Assert decompressed output is equal to input. `prop_assert_eq!` reports the
            // failure to proptest so the failing case can be shrunk.
            prop_assert_eq!(input, bytes);
        }
    }
}