// rsomics_bam_quickcheck/lib.rs

//! Quickly validate a BAM file — port of `samtools quickcheck`.
//!
//! Checks performed (matches samtools 1.23.1 behavior):
//! 1. The file begins with a valid BGZF block magic (`1f 8b 08 04`).
//! 2. The first BGZF block decompresses and starts with the BAM header magic
//!    `BAM\1` (`42 41 4d 01`).
//! 3. The file ends with the 28-byte BGZF empty-block EOF marker
//!    (`1f 8b 08 04 00 00 00 00 00 ff 06 00 42 43 02 00 1b 00 03 00 00 00 00 00 00 00 00 00`)
//!    — unless `--no-eof` is set.
//!
//! Exit code 0 = all input files valid; non-zero = at least one invalid.
//!
//! ## Origin
//! Independent Rust reimplementation of `samtools quickcheck`. samtools is
//! MIT-licensed; the precise BGZF EOF marker bytes and the "BAM\1" header magic
//! check were determined from samtools' MIT-licensed source plus the SAM/BAM
//! and BGZF specifications.
//!
//! License: MIT OR Apache-2.0. Upstream credit: samtools (MIT).

use std::fs::File;
use std::io::{ErrorKind, Read, Seek, SeekFrom};
use std::path::Path;

use flate2::read::MultiGzDecoder;

/// The 28-byte BGZF EOF (empty-block) marker per the SAM/BAM specification.
///
/// It is a complete gzip member with an empty payload; a well-formed BGZF
/// file ends with exactly these bytes.
pub const BGZF_EOF: [u8; 28] = [
    // gzip header: ID1, ID2, CM (deflate), FLG (FEXTRA set)
    0x1f, 0x8b, 0x08, 0x04,
    // MTIME (4 bytes), XFL, OS (unknown)
    0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
    // XLEN = 6, followed by the "BC" subfield: SI1, SI2, SLEN = 2
    0x06, 0x00, 0x42, 0x43, 0x02, 0x00,
    // BSIZE = total block size - 1 (27, little-endian)
    0x1b, 0x00,
    // empty deflate stream
    0x03, 0x00,
    // CRC32 of the (empty) payload
    0x00, 0x00, 0x00, 0x00,
    // ISIZE: uncompressed length = 0
    0x00, 0x00, 0x00, 0x00,
];

/// Options controlling which checks [`quickcheck`] performs.
///
/// The `Default` value enables every check.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct QuickcheckOpts {
    /// Skip the BGZF EOF check (the `-u` / `--no-eof` flag).
    pub no_eof: bool,
}

/// Reasons a file can fail the quickcheck.
#[derive(Debug)]
pub enum QuickcheckError {
    /// An underlying I/O operation failed; the original error is carried along.
    Io(std::io::Error),
    /// The file does not start with the BGZF block magic.
    BadBgzfMagic,
    /// The decompressed data does not start with the BAM header magic.
    BadBamMagic,
    /// The file does not end with the BGZF EOF marker.
    MissingEof,
    /// The file is too short to be a valid BAM.
    Truncated,
}

impl std::fmt::Display for QuickcheckError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Only the `Io` variant needs formatting; every other variant maps to
        // a fixed message.
        let msg = match self {
            Self::Io(e) => return write!(f, "io: {e}"),
            Self::BadBgzfMagic => "bad BGZF magic (not a BGZF/BAM file)",
            Self::BadBamMagic => "bad BAM header magic",
            Self::MissingEof => "missing BGZF EOF marker",
            Self::Truncated => "file too short to be a valid BAM",
        };
        f.write_str(msg)
    }
}

// NOTE: `source()` keeps its default (`None`). The `Io` message already embeds
// the inner error's text, so chained error reporters do not print it twice.
impl std::error::Error for QuickcheckError {}

/// Lets `?` lift any `std::io::Error` into [`QuickcheckError::Io`].
impl From<std::io::Error> for QuickcheckError {
    fn from(e: std::io::Error) -> Self {
        Self::Io(e)
    }
}

68/// Run the quickcheck on a single BAM. Returns `Ok(())` on success.
69pub fn quickcheck(path: &Path, opts: &QuickcheckOpts) -> Result<(), QuickcheckError> {
70    let mut f = File::open(path)?;
71    let meta = f.metadata()?;
72    let len = meta.len();
73
74    if len < 28 {
75        return Err(QuickcheckError::Truncated);
76    }
77
78    // 1) BGZF magic at offset 0.
79    let mut head = [0u8; 4];
80    f.read_exact(&mut head)?;
81    if head != [0x1f, 0x8b, 0x08, 0x04] {
82        return Err(QuickcheckError::BadBgzfMagic);
83    }
84    // Rewind for BAM magic check.
85    f.seek(SeekFrom::Start(0))?;
86
87    // 2) Decompress just enough to read the BAM header magic ("BAM\1").
88    // BGZF is multi-member gzip; MultiGzDecoder concatenates blocks transparently.
89    let mut bam_magic = [0u8; 4];
90    MultiGzDecoder::new(&mut f).read_exact(&mut bam_magic)?;
91    if bam_magic != *b"BAM\x01" {
92        return Err(QuickcheckError::BadBamMagic);
93    }
94
95    // 3) BGZF EOF marker — last 28 bytes.
96    if !opts.no_eof {
97        let mut tail = [0u8; 28];
98        f.seek(SeekFrom::End(-28))?;
99        f.read_exact(&mut tail)?;
100        if tail != BGZF_EOF {
101            return Err(QuickcheckError::MissingEof);
102        }
103    }
104
105    Ok(())
106}