genomicframe-core 0.2.0

//! BAM (Binary Alignment/Map) format reader
//!
//! BAM files use BGZF (Blocked GZIP Format) compression which divides data into
//! independent compressed blocks (max 64KB uncompressed). This enables:
//! - Random access via indexes
//! - Parallel decompression
//! - Streaming decompression with O(1) memory
//!
//! # Design Philosophy
//!
//! - **Streaming by default**: Records are never buffered unless explicitly requested
//! - **O(1) memory**: Process multi-GB files with constant memory (one 64KB BGZF block)
//! - **Lazy evaluation**: Parse alignments on-demand as you iterate
//! - **BGZF streaming**: Automatic decompression of blocked gzip format
//!
//! # Examples
//!
//! ```no_run
//! use genomicframe_core::formats::bam::reader::BamReader;
//! use genomicframe_core::core::GenomicRecordIterator;
//!
//! // Streaming: O(1) memory, processes one alignment at a time
//! let mut reader = BamReader::from_path("alignments.bam")?;
//!
//! // Read the header first (required)
//! let header = reader.read_header()?;
//! println!("References: {}", header.references.len());
//!
//! // Iterate through alignments
//! while let Some(record) = reader.next_record()? {
//!     if !record.is_unmapped() && record.mapq >= 30 {
//!         println!("{}: {} at {}", record.qname, record.cigar_string(), record.pos);
//!     }
//! }
//! # Ok::<(), genomicframe_core::error::Error>(())
//! ```

use crate::core::{GenomicReader, GenomicRecordIterator};
use crate::error::{Error, Result};
use flate2::{Decompress, FlushDecompress};
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Read};
use std::path::Path;

/// Maximum size of an uncompressed BGZF block (64KB)
const MAX_BLOCK_SIZE: usize = 65536;

/// BAM file magic number "BAM\1"
const BAM_MAGIC: [u8; 4] = [b'B', b'A', b'M', 1];

/// Lookup table for BAM sequence decoding (4-bit to ASCII)
/// BAM encoding: =ACMGRSVTWYHKDBN (indices 0-15)
const SEQ_LOOKUP: [u8; 16] = [
    b'=', b'A', b'C', b'M', b'G', b'R', b'S', b'V', b'T', b'W', b'Y', b'H', b'K', b'D', b'B', b'N',
];

/// CIGAR operation
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CigarOp {
    /// Match or mismatch (M)
    Match(u32),
    /// Insertion to the reference (I)
    Ins(u32),
    /// Deletion from the reference (D)
    Del(u32),
    /// Skipped region from the reference (N)
    RefSkip(u32),
    /// Soft clipping (S)
    SoftClip(u32),
    /// Hard clipping (H)
    HardClip(u32),
    /// Padding (P)
    Pad(u32),
    /// Sequence match (=)
    Equal(u32),
    /// Sequence mismatch (X)
    Diff(u32),
}

impl CigarOp {
    /// Parse CIGAR operation from encoded value
    pub fn from_encoded(encoded: u32) -> Result<Self> {
        let len = encoded >> 4;
        let op = (encoded & 0xF) as u8;

        match op {
            0 => Ok(CigarOp::Match(len)),
            1 => Ok(CigarOp::Ins(len)),
            2 => Ok(CigarOp::Del(len)),
            3 => Ok(CigarOp::RefSkip(len)),
            4 => Ok(CigarOp::SoftClip(len)),
            5 => Ok(CigarOp::HardClip(len)),
            6 => Ok(CigarOp::Pad(len)),
            7 => Ok(CigarOp::Equal(len)),
            8 => Ok(CigarOp::Diff(len)),
            _ => Err(Error::InvalidInput(format!(
                "Invalid CIGAR operation: {}",
                op
            ))),
        }
    }

    /// Get the length of this operation
    pub fn len(&self) -> u32 {
        match self {
            CigarOp::Match(l)
            | CigarOp::Ins(l)
            | CigarOp::Del(l)
            | CigarOp::RefSkip(l)
            | CigarOp::SoftClip(l)
            | CigarOp::HardClip(l)
            | CigarOp::Pad(l)
            | CigarOp::Equal(l)
            | CigarOp::Diff(l) => *l,
        }
    }

    /// Returns true if this operation consumes reference bases
    pub fn consumes_ref(&self) -> bool {
        matches!(
            self,
            CigarOp::Match(_)
                | CigarOp::Del(_)
                | CigarOp::RefSkip(_)
                | CigarOp::Equal(_)
                | CigarOp::Diff(_)
        )
    }

    /// Returns true if this operation consumes query bases
    pub fn consumes_query(&self) -> bool {
        matches!(
            self,
            CigarOp::Match(_)
                | CigarOp::Ins(_)
                | CigarOp::SoftClip(_)
                | CigarOp::Equal(_)
                | CigarOp::Diff(_)
        )
    }

    /// Convert to single-character string representation
    pub fn to_char(&self) -> char {
        match self {
            CigarOp::Match(_) => 'M',
            CigarOp::Ins(_) => 'I',
            CigarOp::Del(_) => 'D',
            CigarOp::RefSkip(_) => 'N',
            CigarOp::SoftClip(_) => 'S',
            CigarOp::HardClip(_) => 'H',
            CigarOp::Pad(_) => 'P',
            CigarOp::Equal(_) => '=',
            CigarOp::Diff(_) => 'X',
        }
    }

    /// Parse CIGAR string (e.g., "10M2I5D") into a vector of operations
    ///
    /// This is used by SAM parsing. BAM uses binary encoded CIGAR operations.
    pub fn parse_cigar_string(cigar_str: &str) -> Result<Vec<Self>> {
        // Handle special case: "*" means no CIGAR
        if cigar_str == "*" {
            return Ok(Vec::new());
        }

        let mut operations = Vec::new();
        let mut num_str = String::new();

        for ch in cigar_str.chars() {
            if ch.is_ascii_digit() {
                num_str.push(ch);
            } else {
                if num_str.is_empty() {
                    return Err(Error::Parse(format!(
                        "Invalid CIGAR string: operation '{}' without length",
                        ch
                    )));
                }

                let len = num_str
                    .parse::<u32>()
                    .map_err(|_| Error::Parse(format!("Invalid CIGAR length: {}", num_str)))?;

                let op = match ch {
                    'M' => CigarOp::Match(len),
                    'I' => CigarOp::Ins(len),
                    'D' => CigarOp::Del(len),
                    'N' => CigarOp::RefSkip(len),
                    'S' => CigarOp::SoftClip(len),
                    'H' => CigarOp::HardClip(len),
                    'P' => CigarOp::Pad(len),
                    '=' => CigarOp::Equal(len),
                    'X' => CigarOp::Diff(len),
                    _ => return Err(Error::Parse(format!("Invalid CIGAR operation: {}", ch))),
                };

                operations.push(op);
                num_str.clear();
            }
        }

        if !num_str.is_empty() {
            return Err(Error::Parse(format!(
                "Invalid CIGAR string: trailing digits '{}'",
                num_str
            )));
        }

        Ok(operations)
    }
}

impl std::fmt::Display for CigarOp {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}{}", self.len(), self.to_char())
    }
}

/// Reference sequence metadata from BAM header
#[derive(Debug, Clone)]
pub struct RefSeq {
    pub name: String,
    pub length: u32,
}

/// BAM file header
#[derive(Debug, Clone)]
pub struct BamHeader {
    /// SAM header text
    pub text: String,
    /// Reference sequences
    pub references: Vec<RefSeq>,
}

impl BamHeader {
    /// Get reference name by index
    pub fn ref_name(&self, idx: i32) -> Option<&str> {
        if idx < 0 {
            return None;
        }
        self.references.get(idx as usize).map(|r| r.name.as_str())
    }
}

/// BAM alignment record
#[derive(Debug, Clone)]
pub struct BamRecord {
    /// Query template name
    pub qname: String,
    /// Bitwise FLAG
    pub flag: u16,
    /// Reference sequence ID (-1 if unmapped)
    pub refid: i32,
    /// Reference sequence name (resolved from refid during parsing)
    /// "*" for unmapped reads
    pub rname: String,
    /// 0-based leftmost mapping position (-1 if unmapped)
    pub pos: i32,
    /// Mapping quality (255 if unavailable)
    pub mapq: u8,
    /// CIGAR operations
    pub cigar: Vec<CigarOp>,
    /// Reference ID of the mate/next read (-1 if unavailable)
    pub next_refid: i32,
    /// Position of the mate/next read (-1 if unavailable)
    pub next_pos: i32,
    /// Template length
    pub tlen: i32,
    /// Segment sequence (empty if unavailable)
    pub seq: Vec<u8>,
    /// Phred-scaled base qualities (empty if unavailable)
    pub qual: Vec<u8>,
    /// Optional tags
    pub tags: HashMap<String, TagValue>,
}

impl BamRecord {
    /// Check if read is paired
    pub fn is_paired(&self) -> bool {
        (self.flag & 0x1) != 0
    }

    /// Check if read is mapped in proper pair
    pub fn is_proper_pair(&self) -> bool {
        (self.flag & 0x2) != 0
    }

    /// Check if read is unmapped
    pub fn is_unmapped(&self) -> bool {
        (self.flag & 0x4) != 0
    }

    /// Check if mate is unmapped
    pub fn is_mate_unmapped(&self) -> bool {
        (self.flag & 0x8) != 0
    }

    /// Check if read is reverse strand
    pub fn is_reverse(&self) -> bool {
        (self.flag & 0x10) != 0
    }

    /// Check if mate is reverse strand
    pub fn is_mate_reverse(&self) -> bool {
        (self.flag & 0x20) != 0
    }

    /// Check if read is first in pair
    pub fn is_first_in_pair(&self) -> bool {
        (self.flag & 0x40) != 0
    }

    /// Check if read is second in pair
    pub fn is_second_in_pair(&self) -> bool {
        (self.flag & 0x80) != 0
    }

    /// Check if alignment is secondary
    pub fn is_secondary(&self) -> bool {
        (self.flag & 0x100) != 0
    }

    /// Check if read fails quality checks
    pub fn is_qc_fail(&self) -> bool {
        (self.flag & 0x200) != 0
    }

    /// Check if read is a PCR or optical duplicate
    pub fn is_duplicate(&self) -> bool {
        (self.flag & 0x400) != 0
    }

    /// Check if alignment is supplementary
    pub fn is_supplementary(&self) -> bool {
        (self.flag & 0x800) != 0
    }

    /// Get CIGAR string representation
    pub fn cigar_string(&self) -> String {
        self.cigar.iter().map(|op| op.to_string()).collect()
    }

    /// Get sequence as string
    pub fn seq_string(&self) -> String {
        String::from_utf8_lossy(&self.seq).to_string()
    }
}

/// Optional tag value types
#[derive(Debug, Clone)]
pub enum TagValue {
    Char(u8),
    Int(i32),
    Float(f32),
    String(String),
    ByteArray(Vec<u8>),
}

/// BAM file reader with BGZF decompression
///
/// This reader provides streaming access to BAM files,
/// automatically decompressing BGZF blocks for memory efficiency.
///
/// Optimized with reusable decompressor state to avoid repeated allocations.
pub struct BamReader<R: Read> {
    inner: BufReader<R>,
    current_block: Vec<u8>,
    block_pos: usize,
    eof: bool,
    header: Option<BamHeader>,
    /// Reusable decompressor (avoids creating new decoder for each block)
    decompressor: Decompress,
    /// Reusable buffer for compressed data (avoids per-block allocation)
    compressed_buf: Vec<u8>,
    /// Number of BGZF blocks decompressed (for chunk-based parallelism)
    blocks_decompressed: usize,
    /// Optional path to the file (for parallel processing)
    file_path: Option<std::path::PathBuf>,
}

impl BamReader<File> {
    /// Open a BAM file from a path
    ///
    /// # Examples
    /// ```no_run
    /// use genomicframe_core::formats::bam::BamReader;
    ///
    /// let reader = BamReader::from_path("alignments.bam")?;
    /// # Ok::<(), genomicframe_core::error::Error>(())
    /// ```
    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
        let path_buf = path.as_ref().to_path_buf();
        let file = File::open(&path_buf)?;
        let mut reader = Self::new(file);
        reader.file_path = Some(path_buf);
        Ok(reader)
    }

    pub fn compute_stats(
        self,
        config: Option<crate::parallel::ParallelConfig>,
    ) -> Result<super::stats::BamStats> {
        // Use the stored path for parallel processing
        super::stats::BamStats::par_compute(self, config)
    }
}

impl<R: Read> BamReader<R> {
    /// Create a new BAM reader from any Read source
    pub fn new(reader: R) -> Self {
        Self {
            inner: BufReader::new(reader),
            current_block: Vec::new(),
            block_pos: 0,
            eof: false,
            header: None,
            decompressor: Decompress::new(false), // false = raw deflate (no zlib wrapper)
            compressed_buf: Vec::with_capacity(MAX_BLOCK_SIZE),
            blocks_decompressed: 0,
            file_path: None,
        }
    }

    /// Read and parse the BAM header
    ///
    /// This must be called before reading records
    pub fn read_header(&mut self) -> Result<&BamHeader> {
        if self.header.is_some() {
            return Ok(self.header.as_ref().unwrap());
        }

        // Read magic number (4 bytes: "BAM\1")
        let mut magic = [0u8; 4];
        self.read_exact(&mut magic)?;

        if magic != BAM_MAGIC {
            return Err(Error::InvalidInput(format!(
                "Invalid BAM magic number: expected {:?}, got {:?}",
                BAM_MAGIC, magic
            )));
        }

        // Read SAM header text length
        let mut l_text_buf = [0u8; 4];
        self.read_exact(&mut l_text_buf)?;
        let l_text = u32::from_le_bytes(l_text_buf) as usize;

        // Read SAM header text
        let mut text_buf = vec![0u8; l_text];
        self.read_exact(&mut text_buf)?;
        let text = String::from_utf8_lossy(&text_buf).to_string();

        // Read number of reference sequences
        let mut n_ref_buf = [0u8; 4];
        self.read_exact(&mut n_ref_buf)?;
        let n_ref = u32::from_le_bytes(n_ref_buf) as usize;

        // Read reference sequences
        let mut references = Vec::with_capacity(n_ref);
        for _ in 0..n_ref {
            // Read reference name length
            let mut l_name_buf = [0u8; 4];
            self.read_exact(&mut l_name_buf)?;
            let l_name = u32::from_le_bytes(l_name_buf) as usize;

            // Read reference name (null-terminated)
            let mut name_buf = vec![0u8; l_name];
            self.read_exact(&mut name_buf)?;
            // Remove null terminator
            if name_buf.last() == Some(&0) {
                name_buf.pop();
            }
            let name = String::from_utf8_lossy(&name_buf).to_string();

            // Read reference length
            let mut l_ref_buf = [0u8; 4];
            self.read_exact(&mut l_ref_buf)?;
            let length = u32::from_le_bytes(l_ref_buf);

            references.push(RefSeq { name, length });
        }

        self.header = Some(BamHeader { text, references });
        Ok(self.header.as_ref().unwrap())
    }

    /// Get the header (must call read_header first)
    pub fn header(&self) -> Option<&BamHeader> {
        self.header.as_ref()
    }

    /// Set the header manually (for parallel processing)
    ///
    /// This is used when creating a BAM reader that starts mid-file at a BGZF
    /// block boundary. Since the reader won't parse the header itself, we need
    /// to provide it externally (cloned from the main thread).
    ///
    /// # Use Case
    /// Parallel BAM processing:
    /// 1. Main thread reads header once
    /// 2. Worker threads seek to BGZF boundaries
    /// 3. Worker threads call set_header() to provide header context
    /// 4. Worker threads can now parse records (refid lookups work)
    pub fn set_header(&mut self, header: BamHeader) {
        self.header = Some(header);
    }

    /// Get the number of BGZF blocks decompressed so far
    ///
    /// This is used for chunk-based parallel processing to know when
    /// a thread has processed its assigned blocks.
    pub fn blocks_decompressed(&self) -> usize {
        self.blocks_decompressed
    }

    /// Get the file path if the reader was opened from a path
    ///
    /// Returns `None` if the reader was created from a generic `Read` source.
    /// Used for parallel processing to reopen the file.
    pub fn file_path(&self) -> Option<&std::path::Path> {
        self.file_path.as_deref()
    }

    /// Parse a single BAM record from raw bytes
    pub fn parse_record(&self, data: &[u8]) -> Result<BamRecord> {
        // Manual parsing - much faster than Cursor
        let mut pos = 0;

        // Helper macro to read bytes and advance position
        macro_rules! read_bytes {
            ($n:expr) => {{
                if pos + $n > data.len() {
                    return Err(Error::InvalidInput(
                        "Unexpected end of BAM record".to_string(),
                    ));
                }
                let bytes = &data[pos..pos + $n];
                pos += $n;
                bytes
            }};
        }

        // Read block size (validates record, but not used)
        let _block_size = u32::from_le_bytes(read_bytes!(4).try_into().unwrap());

        // Read core fields (36 bytes total after block_size)
        let refid = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());

        // Resolve reference name from refid using header
        let rname = if let Some(header) = &self.header {
            header
                .ref_name(refid)
                .unwrap_or("*") // Unmapped or invalid refid
                .to_string()
        } else {
            // Fallback if header not read (shouldn't happen in normal flow)
            if refid >= 0 {
                format!("refid_{}", refid)
            } else {
                "*".to_string()
            }
        };

        let record_pos = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());
        let l_read_name = read_bytes!(1)[0];
        let mapq = read_bytes!(1)[0];
        let _bin = u16::from_le_bytes(read_bytes!(2).try_into().unwrap()); // BAI index bin (not used)
        let n_cigar_op = u16::from_le_bytes(read_bytes!(2).try_into().unwrap());
        let flag = u16::from_le_bytes(read_bytes!(2).try_into().unwrap());
        let l_seq = u32::from_le_bytes(read_bytes!(4).try_into().unwrap());
        let next_refid = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());
        let next_pos = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());
        let tlen = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());

        // Read query name
        let qname_bytes = read_bytes!(l_read_name as usize);
        // Remove null terminator
        let qname_len = if qname_bytes.last() == Some(&0) {
            qname_bytes.len() - 1
        } else {
            qname_bytes.len()
        };
        let qname = String::from_utf8_lossy(&qname_bytes[..qname_len]).to_string();

        // Read CIGAR operations (4 bytes each)
        let mut cigar = Vec::with_capacity(n_cigar_op as usize);
        for _ in 0..n_cigar_op {
            let encoded = u32::from_le_bytes(read_bytes!(4).try_into().unwrap());
            cigar.push(CigarOp::from_encoded(encoded)?);
        }

        // Read and decode sequence (4 bases per byte, packed)
        let seq_bytes_len = ((l_seq + 1) / 2) as usize;
        let seq_buf = read_bytes!(seq_bytes_len);

        // Decode sequence using lookup table
        let mut seq = Vec::with_capacity(l_seq as usize);
        for byte in seq_buf {
            seq.push(SEQ_LOOKUP[(byte >> 4) as usize]);
            seq.push(SEQ_LOOKUP[(byte & 0x0F) as usize]);
        }
        // Trim to exact length if odd
        seq.truncate(l_seq as usize);

        // Read quality scores
        let qual_bytes = read_bytes!(l_seq as usize);
        // Quality 0xFF means unavailable - check first byte for early exit
        let qual = if l_seq > 0 && qual_bytes[0] == 0xFF && qual_bytes.iter().all(|&q| q == 0xFF) {
            Vec::new()
        } else {
            qual_bytes.to_vec()
        };

        // Read optional tags (remaining bytes)
        let mut tags = HashMap::new();
        while pos < data.len() {
            // Need at least 3 bytes for tag (2) + type (1)
            if pos + 3 > data.len() {
                break;
            }

            // Read tag (2 ASCII characters)
            let tag_bytes = read_bytes!(2);
            let tag = String::from_utf8_lossy(tag_bytes).to_string();

            // Read value type
            let val_type = read_bytes!(1)[0];

            let value = match val_type {
                b'A' => TagValue::Char(read_bytes!(1)[0]),
                b'c' => TagValue::Int(i8::from_le_bytes(read_bytes!(1).try_into().unwrap()) as i32),
                b'C' => TagValue::Int(read_bytes!(1)[0] as i32),
                b's' => {
                    TagValue::Int(i16::from_le_bytes(read_bytes!(2).try_into().unwrap()) as i32)
                }
                b'S' => {
                    TagValue::Int(u16::from_le_bytes(read_bytes!(2).try_into().unwrap()) as i32)
                }
                b'i' => TagValue::Int(i32::from_le_bytes(read_bytes!(4).try_into().unwrap())),
                b'I' => {
                    TagValue::Int(u32::from_le_bytes(read_bytes!(4).try_into().unwrap()) as i32)
                }
                b'f' => TagValue::Float(f32::from_le_bytes(read_bytes!(4).try_into().unwrap())),
                b'Z' => {
                    // Null-terminated string
                    let start = pos;
                    while pos < data.len() && data[pos] != 0 {
                        pos += 1;
                    }
                    let s = &data[start..pos];
                    if pos < data.len() {
                        pos += 1; // Skip null terminator
                    }
                    TagValue::String(String::from_utf8_lossy(s).to_string())
                }
                b'H' => {
                    // Hex string (same as Z)
                    let start = pos;
                    while pos < data.len() && data[pos] != 0 {
                        pos += 1;
                    }
                    let s = &data[start..pos];
                    if pos < data.len() {
                        pos += 1; // Skip null terminator
                    }
                    TagValue::String(String::from_utf8_lossy(s).to_string())
                }
                b'B' => {
                    // Array
                    let array_type = read_bytes!(1)[0];
                    let count = u32::from_le_bytes(read_bytes!(4).try_into().unwrap()) as usize;
                    let mut arr = Vec::new();
                    for _ in 0..count {
                        match array_type {
                            b'c' => arr.push(read_bytes!(1)[0]),
                            b'C' => arr.push(read_bytes!(1)[0]),
                            b's' => arr.extend_from_slice(read_bytes!(2)),
                            b'S' => arr.extend_from_slice(read_bytes!(2)),
                            b'i' => arr.extend_from_slice(read_bytes!(4)),
                            b'I' => arr.extend_from_slice(read_bytes!(4)),
                            b'f' => arr.extend_from_slice(read_bytes!(4)),
                            _ => {
                                return Err(Error::InvalidInput(format!(
                                    "Unknown array type: {}",
                                    array_type
                                )))
                            }
                        }
                    }
                    TagValue::ByteArray(arr)
                }
                _ => {
                    return Err(Error::InvalidInput(format!(
                        "Unknown tag type: {}",
                        val_type as char
                    )))
                }
            };

            tags.insert(tag, value);
        }

        Ok(BamRecord {
            qname,
            flag,
            refid,
            rname,
            pos: record_pos,
            mapq,
            cigar,
            next_refid,
            next_pos,
            tlen,
            seq,
            qual,
            tags,
        })
    }

    /// Create an iterator over BAM records
    ///
    /// This provides an alternative to using `next_record()` directly
    pub fn records(&mut self) -> BamRecordIterator<'_, R> {
        BamRecordIterator { reader: self }
    }

    /// Read exact number of bytes (handles BGZF block boundaries)
    fn read_exact(&mut self, buf: &mut [u8]) -> std::io::Result<()> {
        let mut total_read = 0;
        while total_read < buf.len() {
            let n = self.read(&mut buf[total_read..])?;
            if n == 0 {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::UnexpectedEof,
                    "unexpected end of file",
                ));
            }
            total_read += n;
        }
        Ok(())
    }

    /// Read and decompress the next bam block
    ///
    /// Returns true if a block was read, false on EOF
    fn read_next_block(&mut self) -> Result<bool> {
        if self.eof {
            return Ok(false);
        }

        // Read gzip header (10 bytes minimum)
        let mut header = [0u8; 10];
        match self.inner.read_exact(&mut header) {
            Ok(_) => {}
            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
                self.eof = true;
                return Ok(false);
            }
            Err(e) => return Err(Error::Io(e)),
        }

        // Verify gzip magic number
        if header[0] != 31 || header[1] != 139 {
            return Err(Error::InvalidInput(
                "Invalid bam block: bad gzip magic number".to_string(),
            ));
        }

        // Check for extra field (bam requires this)
        let flags = header[3];
        let has_extra = (flags & 0x04) != 0;

        if !has_extra {
            return Err(Error::InvalidInput(
                "Invalid bam block: missing extra field".to_string(),
            ));
        }

        // Read extra field length
        let mut xlen_buf = [0u8; 2];
        self.inner.read_exact(&mut xlen_buf)?;
        let xlen = u16::from_le_bytes(xlen_buf) as usize;

        // Read extra field (contains bam block size)
        let mut extra = vec![0u8; xlen];
        self.inner.read_exact(&mut extra)?;

        // Parse bam subfield to get block size
        let bsize = self.parse_bam_extra(&extra)?;

        // Calculate compressed data size
        // bsize = total block size - 1
        // header (10) + xlen (2) + extra (xlen) + compressed data + footer (8) = bsize + 1
        let compressed_size = bsize + 1 - 10 - 2 - xlen - 8;

        // Read compressed data into reusable buffer
        self.compressed_buf.clear();
        self.compressed_buf.resize(compressed_size, 0);
        self.inner.read_exact(&mut self.compressed_buf)?;

        // Read footer (CRC32 and uncompressed size)
        let mut footer = [0u8; 8];
        self.inner.read_exact(&mut footer)?;

        // Reset decompressor for new block
        self.decompressor.reset(false);

        // Decompress the block using reusable decompressor
        // Allocate sufficient space for decompressed output
        self.current_block.clear();
        self.current_block.resize(MAX_BLOCK_SIZE, 0);

        // Decompress directly into the buffer
        let before_out = self.decompressor.total_out();

        self.decompressor
            .decompress(
                &self.compressed_buf,
                &mut self.current_block,
                FlushDecompress::Finish,
            )
            .map_err(|e| Error::InvalidInput(format!("Decompression failed: {}", e)))?;

        // Calculate actual bytes decompressed
        let bytes_out = (self.decompressor.total_out() - before_out) as usize;

        // Truncate to actual decompressed size
        self.current_block.truncate(bytes_out);

        // Verify uncompressed size doesn't exceed BGZF limit
        if self.current_block.len() > MAX_BLOCK_SIZE {
            return Err(Error::InvalidInput(format!(
                "BGZF block too large: {} bytes (max {})",
                self.current_block.len(),
                MAX_BLOCK_SIZE
            )));
        }

        self.block_pos = 0;
        self.blocks_decompressed += 1;
        Ok(true)
    }

    /// Parse bam extra field to extract block size
    fn parse_bam_extra(&self, extra: &[u8]) -> Result<usize> {
        let mut pos = 0;

        while pos + 4 <= extra.len() {
            let si1 = extra[pos];
            let si2 = extra[pos + 1];
            let slen = u16::from_le_bytes([extra[pos + 2], extra[pos + 3]]) as usize;

            // Check for bam subfield (SI1='B', SI2='C')
            if si1 == 66 && si2 == 67 {
                if pos + 4 + slen > extra.len() {
                    return Err(Error::InvalidInput("bam extra field truncated".to_string()));
                }

                // bam subfield should be 2 bytes (block size - 1)
                if slen != 2 {
                    return Err(Error::InvalidInput(format!(
                        "Invalid bam subfield length: {}",
                        slen
                    )));
                }

                let bsize = u16::from_le_bytes([extra[pos + 4], extra[pos + 5]]) as usize;
                return Ok(bsize);
            }

            pos += 4 + slen;
        }

        Err(Error::InvalidInput(
            "bam subfield not found in extra field".to_string(),
        ))
    }
}

impl<R: Read> Read for BamReader<R> {
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        if buf.is_empty() {
            return Ok(0);
        }

        // If current block is exhausted, read next one
        if self.block_pos >= self.current_block.len() {
            match self.read_next_block() {
                Ok(true) => {}
                Ok(false) => return Ok(0), // EOF
                Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
            }
        }

        // Copy data from current block
        let available = self.current_block.len() - self.block_pos;
        let to_copy = buf.len().min(available);

        buf[..to_copy]
            .copy_from_slice(&self.current_block[self.block_pos..self.block_pos + to_copy]);
        self.block_pos += to_copy;

        Ok(to_copy)
    }
}

// Implement GenomicRecordIterator trait (like VCF and FASTQ)
impl<R: Read> GenomicRecordIterator for BamReader<R> {
    type Record = BamRecord;

    fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
        // Read record size (4 bytes)
        let mut size_buf = [0u8; 4];
        match self.read_exact(&mut size_buf) {
            Ok(_) => {}
            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
                return Ok(None);
            }
            Err(e) => return Err(e.into()),
        }

        let block_size = u32::from_le_bytes(size_buf);

        // Read record data
        let mut record_data = vec![0u8; block_size as usize + 4]; // +4 for block_size field
        record_data[0..4].copy_from_slice(&size_buf);
        self.read_exact(&mut record_data[4..])?;
        Ok(Some(record_data))
    }

    fn next_record(&mut self) -> Result<Option<Self::Record>> {
        // Read record size (4 bytes)
        let record_data = self.next_raw()?;
        if record_data.is_none() {
            return Ok(None);
        }
        let record_data = record_data.unwrap();
        // Parse record
        self.parse_record(&record_data).map(Some)
    }
}

// Implement GenomicReader trait
impl<R: Read> GenomicReader for BamReader<R> {
    type Metadata = BamHeader;

    fn metadata(&self) -> &Self::Metadata {
        self.header
            .as_ref()
            .expect("Header not read yet. Call read_header() first.")
    }
}

/// Iterator over BAM records
///
/// This provides an alternative to using `next_record()` directly.
/// Created by calling `reader.records()`.
pub struct BamRecordIterator<'a, R: Read> {
    reader: &'a mut BamReader<R>,
}

impl<'a, R: Read> Iterator for BamRecordIterator<'a, R> {
    type Item = Result<BamRecord>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.reader.next_record() {
            Ok(Some(record)) => Some(Ok(record)),
            Ok(None) => None,
            Err(e) => Some(Err(e)),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn test_cigar_op_encoding() {
        // Test CIGAR operation encoding/decoding
        let encoded = (10 << 4) | 0; // 10M
        let op = CigarOp::from_encoded(encoded).unwrap();
        assert_eq!(op, CigarOp::Match(10));
        assert_eq!(op.len(), 10);
        assert!(op.consumes_ref());
        assert!(op.consumes_query());
        assert_eq!(op.to_char(), 'M');
    }

    #[test]
    fn test_cigar_op_display() {
        assert_eq!(CigarOp::Match(10).to_string(), "10M");
        assert_eq!(CigarOp::Ins(5).to_string(), "5I");
        assert_eq!(CigarOp::Del(3).to_string(), "3D");
    }

    #[test]
    fn test_bam_reader_creation() {
        let data = vec![0u8; 1024];
        let cursor = Cursor::new(data);
        let _reader = BamReader::new(cursor);
        // Should not panic
    }

    #[test]
    fn test_bam_reader_invalid_magic() {
        // Not a valid BGZF file
        let data = vec![0u8; 100];
        let cursor = Cursor::new(data);
        let mut reader = BamReader::new(cursor);

        let mut buf = vec![0u8; 10];
        let result = reader.read(&mut buf);

        // Should fail because it's not valid BGZF
        assert!(result.is_err() || result.unwrap() == 0);
    }

    #[test]
    fn test_parse_bgzf_extra_valid() {
        let reader = BamReader::new(Cursor::new(vec![]));

        // Valid BGZF extra field: SI1=B(66), SI2=C(67), SLEN=2, BSIZE=100
        let extra = vec![66, 67, 2, 0, 100, 0];
        let result = reader.parse_bam_extra(&extra);

        assert!(result.is_ok());
        assert_eq!(result.unwrap(), 100);
    }

    #[test]
    fn test_parse_bgzf_extra_invalid() {
        let reader = BamReader::new(Cursor::new(vec![]));

        // Invalid - no BGZF signature
        let extra = vec![1, 2, 2, 0, 100, 0];
        let result = reader.parse_bam_extra(&extra);

        assert!(result.is_err());
    }

    #[test]
    fn test_parse_bgzf_extra_truncated() {
        let reader = BamReader::new(Cursor::new(vec![]));

        // Truncated extra field
        let extra = vec![66, 67, 2, 0];
        let result = reader.parse_bam_extra(&extra);

        assert!(result.is_err());
    }

    #[test]
    fn test_max_block_size() {
        assert_eq!(MAX_BLOCK_SIZE, 65536);
    }

    #[test]
    fn test_bam_reader_from_empty() {
        let cursor = Cursor::new(vec![]);
        let mut reader = BamReader::new(cursor);

        let mut buf = vec![0u8; 10];
        let result = reader.read(&mut buf);

        // Should return 0 for EOF, not error
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), 0);
    }

    // TODO: Add integration tests with actual BGZF-compressed data
}