genomicframe-core 0.2.0

High-performance genomics I/O and interoperability layer
Documentation
use crate::error::Result;

/// Scan a BAM file to find all BGZF block starting positions
///
/// This performs a fast sequential scan of the compressed file looking for
/// BGZF block boundaries (gzip magic number 0x1f 0x8b). This is used for
/// chunk-based parallel processing where each thread processes a range of blocks.
///
/// # Performance
/// This is much faster than decompressing - typically takes <1 second for a 1GB file.
/// The scan just reads the compressed data looking for magic bytes.
///
/// # Returns
/// Vector of byte offsets where each BGZF block starts in the compressed file.
/// The first offset is always 0 (start of file after BAM header).
///
/// # Example
/// ```ignore
/// let blocks = scan_bgzf_blocks("alignments.bam")?;
/// // blocks = [0, 45231, 89445, 134567, ...]
/// // Divide these blocks among threads for parallel processing
/// ```
pub fn scan_bgzf_blocks<P: AsRef<std::path::Path>>(path: P) -> Result<Vec<u64>> {
    use std::fs::File;
    use std::io::{BufReader, Read};

    let file = File::open(path)?;
    let file_size = file.metadata()?.len();
    let mut reader = BufReader::with_capacity(1024 * 1024, file); // 1MB buffer for fast scanning

    const GZIP_MAGIC: [u8; 2] = [0x1f, 0x8b];
    const GZIP_METHOD_DEFLATE: u8 = 8;
    const GZIP_FLAG_FEXTRA: u8 = 0x04; // BGZF requires extra field
    
    let mut blocks = Vec::new();
    let mut pos: u64 = 0;

    // Read through file looking for valid BGZF headers
    while pos < file_size {
        let mut header = [0u8; 10];
        
        // Try to read 10-byte gzip header
        match reader.read_exact(&mut header) {
            Ok(_) => {
                // Check if this looks like a valid BGZF block header
                if header[0] == GZIP_MAGIC[0] 
                    && header[1] == GZIP_MAGIC[1]
                    && header[2] == GZIP_METHOD_DEFLATE
                    && (header[3] & GZIP_FLAG_FEXTRA) != 0
                {
                    // This looks like a valid BGZF block!
                    blocks.push(pos);
                    
                    // Now we need to skip to the end of this block
                    // Read the extra field to get block size
                    let mut xlen_buf = [0u8; 2];
                    if reader.read_exact(&mut xlen_buf).is_ok() {
                        let xlen = u16::from_le_bytes(xlen_buf) as usize;
                        
                        // Read extra field to find BSIZE
                        let mut extra = vec![0u8; xlen];
                        if reader.read_exact(&mut extra).is_ok() {
                            // Parse BGZF extra field to get block size
                            if let Ok(bsize) = parse_bgzf_bsize(&extra) {
                                // Skip to end of this block
                                // bsize = total block size - 1
                                // We've read: header(10) + xlen(2) + extra(xlen) = 12 + xlen
                                // Remaining: bsize + 1 - (12 + xlen)
                                let remaining = (bsize + 1) as i64 - (12 + xlen as i64);
                                if remaining > 0 {
                                    // Skip the compressed data + footer
                                    let mut skip_buf = vec![0u8; remaining as usize];
                                    if reader.read_exact(&mut skip_buf).is_ok() {
                                        pos += (bsize + 1) as u64;
                                        continue;
                                    }
                                }
                            }
                        }
                    }
                    
                    // If we couldn't parse the block properly, just advance 1 byte
                    pos += 1;
                } else {
                    // Not a valid BGZF header, advance 1 byte
                    pos += 1;
                }
            }
            Err(_) => break, // EOF or error
        }
    }

    Ok(blocks)
}

/// Parse BGZF extra field to extract block size (BSIZE)
fn parse_bgzf_bsize(extra: &[u8]) -> Result<usize> {
    let mut pos = 0;
    
    while pos + 4 <= extra.len() {
        let si1 = extra[pos];
        let si2 = extra[pos + 1];
        let slen = u16::from_le_bytes([extra[pos + 2], extra[pos + 3]]) as usize;
        
        // Check for BGZF subfield (SI1='B'=66, SI2='C'=67)
        if si1 == 66 && si2 == 67 {
            if pos + 4 + slen > extra.len() || slen != 2 {
                return Err(crate::error::Error::InvalidInput(
                    "Invalid BGZF extra field".to_string()
                ));
            }
            
            let bsize = u16::from_le_bytes([extra[pos + 4], extra[pos + 5]]) as usize;
            return Ok(bsize);
        }
        
        pos += 4 + slen;
    }
    
    Err(crate::error::Error::InvalidInput(
        "BGZF subfield not found".to_string()
    ))
}