bgzf 0.3.0

Utility library for working with explicitly BGZF-compressed data
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
//! This library provides both high-level readers and writers for the BGZF format, as well as
//! lower-level compressor and decompressor functions.
//!
//! Bgzf is a multi-gzip format that adds an extra field to the header indicating how large the
//! complete block (with header and footer) is.
//!
//! # Examples
//!
//! ```rust
//! use bgzf::{Reader, Writer};
//! use std::error::Error;
//! use std::io;
//!
//! /// Contrived example that decompresses stdin and compresses to stdout.
//! fn main() -> Result<(), Box<dyn Error>> {
//!     let mut reader = Reader::new(io::stdin());
//!     let mut writer = Writer::new(io::stdout(), 2.try_into()?);
//!     let total_bytes = io::copy(&mut reader, &mut writer)?;
//!     eprintln!("{} uncompressed bytes", total_bytes);
//!     Ok(())
//! }
//! ```
#![deny(unsafe_code)]
#![allow(clippy::must_use_candidate, clippy::missing_errors_doc, clippy::missing_panics_doc)]

// Re-export the reader and writer to the same level.
mod reader;
mod writer;
pub use reader::*;
pub use writer::*;

use std::io;

use byteorder::{ByteOrder, LittleEndian};
use libdeflater::CompressionLvl;
use thiserror::Error;

/// Buffer operations that avoid unnecessary memory initialization.
/// Buffer operations that avoid unnecessary memory initialization.
mod buffer_ops {
    /// Resizes a buffer to `new_len` without initializing the new bytes.
    ///
    /// # Safety
    ///
    /// The caller must ensure that every byte in `0..new_len` is written
    /// before any of them is read. Note that `clear()` only sets the length
    /// to zero — it does NOT zero the backing memory, so the exposed bytes
    /// may contain stale data from previous uses of the buffer (and, for
    /// freshly reserved capacity, are uninitialized). Relying on "`u8` has
    /// no invalid bit patterns" is not by itself sufficient to make reading
    /// them sound; callers must overwrite the full range first.
    /// `reserve_exact()` guarantees the capacity needed for `set_len`.
    #[inline(always)]
    #[allow(unsafe_code, clippy::uninit_vec)]
    pub(crate) unsafe fn resize_uninit(buffer: &mut Vec<u8>, new_len: usize) {
        buffer.clear();
        buffer.reserve_exact(new_len);
        buffer.set_len(new_len);
    }
}

/// The maximum uncompressed blocksize for BGZF compression (taken from bgzip), used for initializing blocks.
pub const BGZF_BLOCK_SIZE: usize = 65280;

/// 128 KB default buffer size, same as pigz.
pub const BUFSIZE: usize = 128 * 1024;

/// Maximum size of a complete BGZF block (header + payload + footer).
/// Default from bgzf: compress(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE
/// 65536 which is u16::MAX + 1
pub(crate) const MAX_BGZF_BLOCK_SIZE: usize = 64 * 1024;

/// Canonical 28-byte empty BGZF block, appended as an end-of-file marker.
pub(crate) static BGZF_EOF: &[u8] = &[
    0x1f, 0x8b, // ID1, ID2 (gzip magic)
    0x08, // CM = DEFLATE
    0x04, // FLG = FEXTRA
    0x00, 0x00, 0x00, 0x00, // MTIME = 0
    0x00, // XFL = 0
    0xff, // OS = 255 (unknown)
    0x06, 0x00, // XLEN = 6
    0x42, 0x43, // SI1, SI2 = "BC"
    0x02, 0x00, // SLEN = 2
    0x1b, 0x00, // BSIZE = 27 (total block length minus one)
    0x03, 0x00, // CDATA (empty deflate payload; ISIZE below is 0)
    0x00, 0x00, 0x00, 0x00, // CRC32 = 0x00000000
    0x00, 0x00, 0x00, 0x00, // ISIZE = 0
];

/// Fixed length of the gzip header plus BGZF extra subfield.
pub(crate) const BGZF_HEADER_SIZE: usize = 18;
/// Footer length: 4-byte CRC32 followed by 4-byte ISIZE.
pub(crate) const BGZF_FOOTER_SIZE: usize = 8;
/// Size of the CRC32 portion of the footer.
pub(crate) const BGZF_SIZEOF_CRC32: usize = 4;
/// FLG bit 2 (0x04, FEXTRA): an extra field is present in the header.
pub(crate) const BGZF_NAME_COMMENT_EXTRA_FLAG: u8 = 4;
/// First byte of the BGZF extra-subfield identifier ("BC").
pub(crate) const BGZF_SUBFIELD_ID1: u8 = b'B';
/// Second byte of the BGZF extra-subfield identifier ("BC").
pub(crate) const BGZF_SUBFIELD_ID2: u8 = b'C';
/// Byte offset of the little-endian u16 BSIZE field within the header.
pub(crate) const BGZF_BLOCK_SIZE_OFFSET: usize = 16;
/// Byte offset of the XFL (compression hint) byte within the header.
pub(crate) const BGZF_XFL_OFFSET: usize = 8;

/// gzip XFL hint written when compressing at the best (slowest) level.
pub(crate) const BGZF_COMPRESSION_HINT_BEST: u8 = 2;
/// gzip XFL hint written when compressing at the fastest level.
pub(crate) const BGZF_COMPRESSION_HINT_FASTEST: u8 = 4;
/// gzip XFL hint written for any intermediate level.
pub(crate) const BGZF_COMPRESSION_HINT_OTHER: u8 = 0;

/// Pre-computed BGZF header template. Only bytes 8 (XFL) and 16-17 (BSIZE) vary.
const HEADER_TEMPLATE: [u8; BGZF_HEADER_SIZE] = [
    0x1f, 0x8b, // ID1, ID2 (magic)
    0x08, // CM = DEFLATE
    0x04, // FLG = FEXTRA
    0x00, 0x00, 0x00, 0x00, // MTIME = 0
    0x00, // XFL = placeholder (byte 8)
    0xff, // OS = 255
    0x06, 0x00, // XLEN = 6
    b'B', b'C', // SI1, SI2
    0x02, 0x00, // SLEN = 2
    0x00, 0x00, // BSIZE placeholder (bytes 16-17)
];

/// Crate-local result alias.
type BgzfResult<T> = Result<T, BgzfError>;

/// Errors produced while reading or writing BGZF data.
#[non_exhaustive]
#[derive(Error, Debug)]
pub enum BgzfError {
    /// A compressed block does not fit within the BGZF block-size limit.
    #[error("Compressed block size ({0}) exceeds max allowed: ({1})")]
    BlockSizeExceeded(usize, usize),
    /// The requested compression level was rejected by libdeflater.
    #[error("Invalid compression level: {0}")]
    CompressionLevel(u8),
    /// An underlying I/O error.
    #[error(transparent)]
    Io(#[from] io::Error),
    /// The CRC32 of decompressed data did not match the block footer.
    #[error("Invalid checksum, found {found}, expected {expected}")]
    InvalidChecksum { found: u32, expected: u32 },
    /// A block header failed validation.
    #[error("Invalid block header: {0}")]
    InvalidHeader(&'static str),
    /// Compression failed inside libdeflater.
    #[error("LibDeflater compression error: {0:?}")]
    LibDeflaterCompress(libdeflater::CompressionError),
    /// Decompression failed inside libdeflater.
    // NOTE(review): the variant name misspells "Deflater" as "Delfater".
    // Renaming it would break downstream code matching on this public enum,
    // so it is left as-is.
    #[error(transparent)]
    LibDelfaterDecompress(#[from] libdeflater::DecompressionError),
}

/// The expected checksum and number of bytes for decompressed data,
/// as read from a block's footer.
#[derive(Debug, Copy, Clone)]
struct ChecksumValues {
    /// The CRC32 checksum from the footer
    sum: u32,
    /// The number of bytes that went into the sum (the footer's ISIZE field)
    amount: u32,
}

/// Level of compression to use for the compressors.
///
/// Valid values are 1-12. See [libdeflater](https://github.com/ebiggers/libdeflate#compression-levels) documentation on levels.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CompressionLevel(CompressionLvl);

#[allow(dead_code)]
impl CompressionLevel {
    /// Construct a [`CompressionLevel`] from a raw level number.
    ///
    /// Valid levels are 1-12.
    #[allow(clippy::cast_lossless)]
    pub fn new(level: u8) -> BgzfResult<Self> {
        // libdeflater's CompressionLvlError carries no detail, so report
        // the offending level ourselves.
        match CompressionLvl::new(level as i32) {
            Ok(lvl) => Ok(Self(lvl)),
            Err(_) => Err(BgzfError::CompressionLevel(level)),
        }
    }

    /// Borrow the wrapped libdeflater compression level.
    fn inner(&self) -> &libdeflater::CompressionLvl {
        &self.0
    }
}

impl TryFrom<u8> for CompressionLevel {
    type Error = BgzfError;

    /// Try to convert a `u8` to a compression level.
    ///
    /// # Example
    /// ```rust
    /// use bgzf::CompressionLevel;
    ///
    /// let level: CompressionLevel = 2.try_into().unwrap();
    /// assert_eq!(level, CompressionLevel::new(2).unwrap());
    /// ```
    fn try_from(value: u8) -> Result<Self, Self::Error> {
        Self::new(value)
    }
}

impl From<CompressionLevel> for u8 {
    /// Convenience method for converting [`CompressionLevel`] back to a [`u8`].
    fn from(level: CompressionLevel) -> Self {
        // libdeflater levels are small integers, so this narrowing cast
        // cannot truncate in practice.
        let inner: i32 = level.inner().into();
        inner as u8
    }
}

impl From<&CompressionLevel> for u8 {
    /// Convenience method for converting [`CompressionLevel`] back to a [`u8`].
    fn from(level: &CompressionLevel) -> Self {
        // `CompressionLevel` is `Copy`; delegate to the by-value conversion
        // so the narrowing logic lives in exactly one place.
        u8::from(*level)
    }
}

/// [`Compressor`] will BGZF compress a block of bytes with the [`Compressor::compress`] method, allowing for reuse of the compressor itself.
///
/// # Example
///
/// ```rust
/// use bgzf::{Compressor, CompressionLevel};
///
/// let mut compressor = Compressor::new(2.try_into().unwrap());
/// let input = &[b'A'; 100];
/// let mut output_buffer = vec![];
/// compressor.compress(input, &mut output_buffer).unwrap();
/// assert!(input.len() > output_buffer.len());
/// ```
pub struct Compressor {
    /// Reusable libdeflater compressor state.
    inner: libdeflater::Compressor,
    /// Level used to derive the XFL hint written into each block header.
    level: CompressionLevel,
}

#[allow(dead_code)]
impl Compressor {
    /// Create a new [`Compressor`] with the given [`CompressionLevel`].
    ///
    /// # Example
    ///
    /// ```rust
    /// use bgzf::Compressor;
    /// let compressor = Compressor::new(3.try_into().expect("Invalid compression level"));
    /// ```
    #[must_use]
    pub fn new(level: CompressionLevel) -> Self {
        Self { inner: libdeflater::Compressor::new(*level.inner()), level }
    }

    /// Borrow the underlying libdeflater compressor.
    #[inline]
    fn inner(&self) -> &libdeflater::Compressor {
        &self.inner
    }

    /// Mutably borrow the underlying libdeflater compressor.
    #[inline]
    fn inner_mut(&mut self) -> &mut libdeflater::Compressor {
        &mut self.inner
    }

    /// Compress a block of bytes, adding a header and footer.
    ///
    /// `buffer` is cleared and overwritten with the complete BGZF block:
    /// the 18-byte header, the deflate payload, then the 8-byte footer
    /// (CRC32 of `input` followed by the uncompressed length, ISIZE).
    ///
    /// # Errors
    ///
    /// Returns [`BgzfError::BlockSizeExceeded`] if the compressed payload
    /// would not fit in a single BGZF block, or
    /// [`BgzfError::LibDeflaterCompress`] if compression fails.
    #[inline(always)]
    pub fn compress(&mut self, input: &[u8], buffer: &mut Vec<u8>) -> BgzfResult<()> {
        // Use libdeflate's official bound calculation
        let compress_bound = self.inner_mut().deflate_compress_bound(input.len());
        let required_size = BGZF_HEADER_SIZE + compress_bound + BGZF_FOOTER_SIZE;

        // SAFETY: All bytes in 0..final_len are written before the function returns:
        // - bytes 0..18: header via copy_from_slice
        // - bytes 18..18+bytes_written: written by deflate_compress
        // - bytes footer_offset..footer_offset+8: footer via copy_from_slice
        // - buffer is truncated to final_len, removing any uninitialized trailing bytes
        #[allow(unsafe_code)]
        unsafe {
            buffer_ops::resize_uninit(buffer, required_size);
        }

        let bytes_written = self
            .inner_mut()
            .deflate_compress(input, &mut buffer[BGZF_HEADER_SIZE..])
            .map_err(BgzfError::LibDeflaterCompress)?;

        // The header's BSIZE field is a u16 storing (total block length - 1),
        // so the WHOLE block -- header + payload + footer -- must fit in
        // 64 KiB. Comparing only `bytes_written` against 64 KiB would accept
        // payloads of 65511..=65535 bytes, which overflow the u16 arithmetic
        // in `header_inner` (panic in debug, corrupt BSIZE in release).
        let max_payload = MAX_BGZF_BLOCK_SIZE - BGZF_HEADER_SIZE - BGZF_FOOTER_SIZE;
        if bytes_written > max_payload {
            return Err(BgzfError::BlockSizeExceeded(bytes_written, max_payload));
        }

        // Compute CRC32 over the *uncompressed* input, as gzip requires.
        let mut crc = libdeflater::Crc::new();
        crc.update(input);

        // Write header
        let header = header_inner(self.level, bytes_written as u16);
        buffer[0..BGZF_HEADER_SIZE].copy_from_slice(&header);

        // Write footer directly at computed offset
        let footer_offset = BGZF_HEADER_SIZE + bytes_written;
        buffer[footer_offset..footer_offset + BGZF_SIZEOF_CRC32]
            .copy_from_slice(&crc.sum().to_le_bytes());
        buffer[footer_offset + BGZF_SIZEOF_CRC32..footer_offset + BGZF_FOOTER_SIZE]
            .copy_from_slice(&(input.len() as u32).to_le_bytes());

        // Truncate to final size (removes uninitialized bytes beyond footer)
        buffer.truncate(footer_offset + BGZF_FOOTER_SIZE);

        Ok(())
    }

    /// Append the 28-byte BGZF EOF marker block.
    pub fn append_eof(bytes: &mut Vec<u8>) {
        bytes.extend(BGZF_EOF);
    }
}

/// [`Decompressor`] will decompress a BGZF block, wrapping a reusable
/// libdeflater decompressor.
struct Decompressor(libdeflater::Decompressor);

#[allow(dead_code)]
impl Decompressor {
    /// Create a new [`Decompressor`].
    fn new() -> Self {
        Self(libdeflater::Decompressor::new())
    }

    /// Borrow the underlying libdeflater decompressor.
    #[inline]
    fn inner(&self) -> &libdeflater::Decompressor {
        &self.0
    }

    /// Mutably borrow the underlying libdeflater decompressor.
    #[inline]
    fn inner_mut(&mut self) -> &mut libdeflater::Decompressor {
        &mut self.0
    }

    /// Decompress a block of bytes.
    ///
    /// This expects the `output` to be the exact size needed to hold the decompressed input.
    /// This expects the input slice to have the header and footer values removed.
    ///
    /// # Errors
    ///
    /// Propagates libdeflater decompression errors, and returns
    /// [`BgzfError::InvalidChecksum`] if the CRC32 of the decompressed
    /// bytes does not match the footer's recorded checksum.
    #[inline]
    fn decompress(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        checksum_values: ChecksumValues,
    ) -> BgzfResult<()> {
        // An ISIZE of 0 means the block carries no data (e.g. the EOF
        // marker), so inflation is skipped and only the checksum is taken.
        // NOTE(review): this assumes `output` is empty whenever `amount`
        // is 0 -- confirm against the caller.
        if checksum_values.amount != 0 {
            let _bytes_decompressed = self.inner_mut().deflate_decompress(input, output)?;
        }
        // CRC32 over everything claimed to have been produced.
        let mut new_check = libdeflater::Crc::new();
        new_check.update(output);

        if checksum_values.sum != new_check.sum() {
            return Err(BgzfError::InvalidChecksum {
                found: new_check.sum(),
                expected: checksum_values.sum,
            });
        }
        Ok(())
    }
}

impl Default for Decompressor {
    fn default() -> Self {
        Self::new()
    }
}

/// Build the 18-byte BGZF block header for a block whose deflate payload is
/// `compressed_size` bytes, stamping the XFL hint derived from
/// `compression_level` and the little-endian BSIZE field.
#[inline(always)]
fn header_inner(
    compression_level: CompressionLevel,
    compressed_size: u16,
) -> [u8; BGZF_HEADER_SIZE] {
    let mut header = HEADER_TEMPLATE;

    // gzip XFL hint: 2 = best/slowest, 4 = fastest, 0 = anything in between.
    let lvl = compression_level.inner();
    let hint = if lvl >= &CompressionLvl::best() {
        BGZF_COMPRESSION_HINT_BEST
    } else if lvl <= &CompressionLvl::fastest() {
        BGZF_COMPRESSION_HINT_FASTEST
    } else {
        BGZF_COMPRESSION_HINT_OTHER
    };
    header[BGZF_XFL_OFFSET] = hint;

    // BSIZE = total block length (header + payload + footer) minus one,
    // stored as a little-endian u16.
    let bsize = compressed_size + BGZF_HEADER_SIZE as u16 + BGZF_FOOTER_SIZE as u16 - 1;
    header[BGZF_BLOCK_SIZE_OFFSET..BGZF_BLOCK_SIZE_OFFSET + 2]
        .copy_from_slice(&bsize.to_le_bytes());

    header
}

/// Validate the BGZF-specific fields of a block header.
///
/// Confirms that the FEXTRA flag is set and that the extra subfield is
/// tagged with the "BC" identifier.
#[inline]
fn check_header(bytes: &[u8]) -> BgzfResult<()> {
    // FLG byte: bit 2 (0x04) signals the presence of an extra field.
    if bytes[3] & 4 != BGZF_NAME_COMMENT_EXTRA_FLAG {
        return Err(BgzfError::InvalidHeader("Extra field flag not set"));
    }
    // The subfield identifier must be the two bytes 'B', 'C'.
    if bytes[12] != BGZF_SUBFIELD_ID1 || bytes[13] != BGZF_SUBFIELD_ID2 {
        return Err(BgzfError::InvalidHeader("Bad SID"));
    }
    Ok(())
}

/// Extract the block size from the header.
#[inline]
fn get_block_size(bytes: &[u8]) -> usize {
    LittleEndian::read_u16(&bytes[BGZF_BLOCK_SIZE_OFFSET..]) as usize + 1
}

/// Get the expected uncompressed size and check sum from the footer
#[inline]
fn get_footer_values(input: &[u8]) -> ChecksumValues {
    let check_sum = LittleEndian::read_u32(&input[input.len() - 8..input.len() - 4]);
    let check_amount = LittleEndian::read_u32(&input[input.len() - 4..]);
    ChecksumValues { sum: check_sum, amount: check_amount }
}

/// Return a compressed block's contents with the 8-byte footer removed.
#[inline]
fn strip_footer(input: &[u8]) -> &[u8] {
    let (body, _footer) = input.split_at(input.len() - BGZF_FOOTER_SIZE);
    body
}

#[cfg(test)]
mod test {
    use std::io::{Read, Write};
    use std::{
        fs::File,
        io::{BufReader, BufWriter},
    };

    use proptest::prelude::*;
    use tempfile::tempdir;

    use super::*;

    /// Test that the EOF marker is written exactly once at the end of the output
    /// when using finish().
    #[test]
    fn test_eof_marker_written_once_with_finish() {
        // Test with data that doesn't fill a complete block
        let mut output = Vec::new();
        {
            let mut writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
            writer.write_all(b"hello").unwrap();
            writer.finish().unwrap();
        }

        // Verify EOF marker appears exactly once at the end
        assert!(output.ends_with(BGZF_EOF), "Output should end with BGZF_EOF marker");

        // Count occurrences of EOF marker
        let eof_count = output.windows(BGZF_EOF.len()).filter(|w| *w == BGZF_EOF).count();
        assert_eq!(eof_count, 1, "EOF marker should appear exactly once");
    }

    /// Test that EOF marker is written exactly once when relying on Drop.
    #[test]
    fn test_eof_marker_written_once_on_drop() {
        let mut output = Vec::new();
        {
            let mut writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
            writer.write_all(b"hello").unwrap();
            // Don't call finish(), let Drop handle it
        }

        // Verify EOF marker appears exactly once at the end
        assert!(output.ends_with(BGZF_EOF), "Output should end with BGZF_EOF marker");

        // Count occurrences of EOF marker
        let eof_count = output.windows(BGZF_EOF.len()).filter(|w| *w == BGZF_EOF).count();
        assert_eq!(eof_count, 1, "EOF marker should appear exactly once");
    }

    /// Test that EOF marker is written even when the buffer is empty.
    #[test]
    fn test_eof_marker_empty_write() {
        let mut output = Vec::new();
        {
            let writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
            // Don't write any data, just finish
            writer.finish().unwrap();
        }

        // Should still have the EOF marker
        assert!(
            output.ends_with(BGZF_EOF),
            "Output should end with BGZF_EOF marker even with no data written"
        );
        // With no data, output should be exactly the EOF marker
        assert_eq!(output.as_slice(), BGZF_EOF);
    }

    /// Test that calling flush() multiple times doesn't write multiple EOF markers.
    #[test]
    fn test_multiple_flush_single_eof() {
        let mut output = Vec::new();
        {
            let mut writer = Writer::new(&mut output, CompressionLevel::new(3).unwrap());
            writer.write_all(b"hello").unwrap();
            writer.flush().unwrap();
            writer.write_all(b"world").unwrap();
            writer.flush().unwrap();
            writer.finish().unwrap();
        }

        // Verify EOF marker appears exactly once at the end
        assert!(output.ends_with(BGZF_EOF), "Output should end with BGZF_EOF marker");

        // Count occurrences of EOF marker
        let eof_count = output.windows(BGZF_EOF.len()).filter(|w| *w == BGZF_EOF).count();
        assert_eq!(
            eof_count, 1,
            "EOF marker should appear exactly once even after multiple flush() calls"
        );
    }

    /// Round-trips a small multi-line input: compress to a temp file,
    /// read it back, decompress, and compare against the original bytes.
    #[test]
    fn test_simple_bgzfsync() {
        let dir = tempdir().unwrap();

        // Define and write input bytes
        let input = b"
        This is a longer test than normal to come up with a bunch of text.
        We'll read just a few lines at a time.
        What if this is a longer string, does that then make
        things fail?
        ";

        let orig_file = dir.path().join("orig.output.txt");
        let mut orig_writer = BufWriter::new(File::create(&orig_file).unwrap());
        orig_writer.write_all(input).unwrap();
        drop(orig_writer);

        // Create output file
        let output_file = dir.path().join("output.txt");
        let out_writer = BufWriter::new(File::create(&output_file).unwrap());

        // Compress input to output
        let mut bgzf = Writer::new(out_writer, CompressionLevel::new(3).unwrap());
        bgzf.write_all(input).unwrap();
        bgzf.finish().unwrap();

        // Read output back in
        let mut reader = BufReader::new(File::open(output_file).unwrap());
        let mut result = vec![];
        reader.read_to_end(&mut result).unwrap();

        // Decompress it
        let mut decoder = Reader::new(&result[..]);
        let mut bytes = vec![];
        decoder.read_to_end(&mut bytes).unwrap();

        // Assert decompressed output is equal to input
        assert_eq!(input.to_vec(), bytes);
    }

    // 32 KiB — presumably chosen to match the deflate dictionary/window
    // size (TODO confirm); used to bound the proptest buffer and input sizes.
    const DICT_SIZE: usize = 32768;
    proptest! {
        // Property test: any input round-trips through compression and
        // decompression unchanged, across varying buffer sizes, write chunk
        // sizes, and compression levels.
        #[test]
        fn proptest_bgzf(
            input in prop::collection::vec(0..u8::MAX, 1..(DICT_SIZE * 10)),
            buf_size in DICT_SIZE..BGZF_BLOCK_SIZE,
            write_size in 1..BGZF_BLOCK_SIZE * 4,
            comp_level in 1..12_u8
        ) {
            let dir = tempdir().unwrap();

            // Create output file
            let output_file = dir.path().join("output.txt");
            let out_writer = BufWriter::new(File::create(&output_file).unwrap());

            // Compress input to output
            let mut writer = Writer::with_capacity(out_writer, CompressionLevel::new(comp_level).unwrap(), buf_size);

            for chunk in input.chunks(write_size) {
                writer.write_all(chunk).unwrap();
            }
            writer.finish().unwrap();

            // Read output back in
            let mut reader = BufReader::new(File::open(output_file).unwrap());
            let mut result = vec![];
            reader.read_to_end(&mut result).unwrap();

            // Decompress it
            let mut gz = Reader::new(&result[..]);
            let mut bytes = vec![];
            gz.read_to_end(&mut bytes).unwrap();

            // Assert decompressed output is equal to input
            assert_eq!(input.clone(), bytes);
        }
    }
}