genomicframe_core/formats/bam/
reader.rs

1//! BAM (Binary Alignment/Map) format reader
2//!
3//! BAM files use BGZF (Blocked GZIP Format) compression which divides data into
4//! independent compressed blocks (max 64KB uncompressed). This enables:
5//! - Random access via indexes
6//! - Parallel decompression
7//! - Streaming decompression with O(1) memory
8//!
9//! # Design Philosophy
10//!
11//! - **Streaming by default**: Records are never buffered unless explicitly requested
12//! - **O(1) memory**: Process multi-GB files with constant memory (one 64KB BGZF block)
13//! - **Lazy evaluation**: Parse alignments on-demand as you iterate
14//! - **BGZF streaming**: Automatic decompression of blocked gzip format
15//!
16//! # Examples
17//!
18//! ```no_run
19//! use genomicframe_core::formats::bam::reader::BamReader;
20//! use genomicframe_core::core::GenomicRecordIterator;
21//!
22//! // Streaming: O(1) memory, processes one alignment at a time
23//! let mut reader = BamReader::from_path("alignments.bam")?;
24//!
25//! // Read the header first (required)
26//! let header = reader.read_header()?;
27//! println!("References: {}", header.references.len());
28//!
29//! // Iterate through alignments
30//! while let Some(record) = reader.next_record()? {
31//!     if !record.is_unmapped() && record.mapq >= 30 {
32//!         println!("{}: {} at {}", record.qname, record.cigar_string(), record.pos);
33//!     }
34//! }
35//! # Ok::<(), genomicframe_core::error::Error>(())
36//! ```
37
38use crate::core::{GenomicReader, GenomicRecordIterator};
39use crate::error::{Error, Result};
40use flate2::{Decompress, FlushDecompress};
41use std::collections::HashMap;
42use std::fs::File;
43use std::io::{BufReader, Read};
44use std::path::Path;
45
46/// Maximum size of an uncompressed BGZF block (64KB)
47const MAX_BLOCK_SIZE: usize = 65536;
48
49/// BAM file magic number "BAM\1"
50const BAM_MAGIC: [u8; 4] = [b'B', b'A', b'M', 1];
51
52/// Lookup table for BAM sequence decoding (4-bit to ASCII)
53/// BAM encoding: =ACMGRSVTWYHKDBN (indices 0-15)
54const SEQ_LOOKUP: [u8; 16] = [
55    b'=', b'A', b'C', b'M', b'G', b'R', b'S', b'V', b'T', b'W', b'Y', b'H', b'K', b'D', b'B', b'N',
56];
57
58/// CIGAR operation
59#[derive(Debug, Clone, Copy, PartialEq, Eq)]
60pub enum CigarOp {
61    /// Match or mismatch (M)
62    Match(u32),
63    /// Insertion to the reference (I)
64    Ins(u32),
65    /// Deletion from the reference (D)
66    Del(u32),
67    /// Skipped region from the reference (N)
68    RefSkip(u32),
69    /// Soft clipping (S)
70    SoftClip(u32),
71    /// Hard clipping (H)
72    HardClip(u32),
73    /// Padding (P)
74    Pad(u32),
75    /// Sequence match (=)
76    Equal(u32),
77    /// Sequence mismatch (X)
78    Diff(u32),
79}
80
81impl CigarOp {
82    /// Parse CIGAR operation from encoded value
83    pub fn from_encoded(encoded: u32) -> Result<Self> {
84        let len = encoded >> 4;
85        let op = (encoded & 0xF) as u8;
86
87        match op {
88            0 => Ok(CigarOp::Match(len)),
89            1 => Ok(CigarOp::Ins(len)),
90            2 => Ok(CigarOp::Del(len)),
91            3 => Ok(CigarOp::RefSkip(len)),
92            4 => Ok(CigarOp::SoftClip(len)),
93            5 => Ok(CigarOp::HardClip(len)),
94            6 => Ok(CigarOp::Pad(len)),
95            7 => Ok(CigarOp::Equal(len)),
96            8 => Ok(CigarOp::Diff(len)),
97            _ => Err(Error::InvalidInput(format!(
98                "Invalid CIGAR operation: {}",
99                op
100            ))),
101        }
102    }
103
104    /// Get the length of this operation
105    pub fn len(&self) -> u32 {
106        match self {
107            CigarOp::Match(l)
108            | CigarOp::Ins(l)
109            | CigarOp::Del(l)
110            | CigarOp::RefSkip(l)
111            | CigarOp::SoftClip(l)
112            | CigarOp::HardClip(l)
113            | CigarOp::Pad(l)
114            | CigarOp::Equal(l)
115            | CigarOp::Diff(l) => *l,
116        }
117    }
118
119    /// Returns true if this operation consumes reference bases
120    pub fn consumes_ref(&self) -> bool {
121        matches!(
122            self,
123            CigarOp::Match(_)
124                | CigarOp::Del(_)
125                | CigarOp::RefSkip(_)
126                | CigarOp::Equal(_)
127                | CigarOp::Diff(_)
128        )
129    }
130
131    /// Returns true if this operation consumes query bases
132    pub fn consumes_query(&self) -> bool {
133        matches!(
134            self,
135            CigarOp::Match(_)
136                | CigarOp::Ins(_)
137                | CigarOp::SoftClip(_)
138                | CigarOp::Equal(_)
139                | CigarOp::Diff(_)
140        )
141    }
142
143    /// Convert to single-character string representation
144    pub fn to_char(&self) -> char {
145        match self {
146            CigarOp::Match(_) => 'M',
147            CigarOp::Ins(_) => 'I',
148            CigarOp::Del(_) => 'D',
149            CigarOp::RefSkip(_) => 'N',
150            CigarOp::SoftClip(_) => 'S',
151            CigarOp::HardClip(_) => 'H',
152            CigarOp::Pad(_) => 'P',
153            CigarOp::Equal(_) => '=',
154            CigarOp::Diff(_) => 'X',
155        }
156    }
157
158    /// Parse CIGAR string (e.g., "10M2I5D") into a vector of operations
159    ///
160    /// This is used by SAM parsing. BAM uses binary encoded CIGAR operations.
161    pub fn parse_cigar_string(cigar_str: &str) -> Result<Vec<Self>> {
162        // Handle special case: "*" means no CIGAR
163        if cigar_str == "*" {
164            return Ok(Vec::new());
165        }
166
167        let mut operations = Vec::new();
168        let mut num_str = String::new();
169
170        for ch in cigar_str.chars() {
171            if ch.is_ascii_digit() {
172                num_str.push(ch);
173            } else {
174                if num_str.is_empty() {
175                    return Err(Error::Parse(format!(
176                        "Invalid CIGAR string: operation '{}' without length",
177                        ch
178                    )));
179                }
180
181                let len = num_str
182                    .parse::<u32>()
183                    .map_err(|_| Error::Parse(format!("Invalid CIGAR length: {}", num_str)))?;
184
185                let op = match ch {
186                    'M' => CigarOp::Match(len),
187                    'I' => CigarOp::Ins(len),
188                    'D' => CigarOp::Del(len),
189                    'N' => CigarOp::RefSkip(len),
190                    'S' => CigarOp::SoftClip(len),
191                    'H' => CigarOp::HardClip(len),
192                    'P' => CigarOp::Pad(len),
193                    '=' => CigarOp::Equal(len),
194                    'X' => CigarOp::Diff(len),
195                    _ => return Err(Error::Parse(format!("Invalid CIGAR operation: {}", ch))),
196                };
197
198                operations.push(op);
199                num_str.clear();
200            }
201        }
202
203        if !num_str.is_empty() {
204            return Err(Error::Parse(format!(
205                "Invalid CIGAR string: trailing digits '{}'",
206                num_str
207            )));
208        }
209
210        Ok(operations)
211    }
212}
213
214impl std::fmt::Display for CigarOp {
215    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
216        write!(f, "{}{}", self.len(), self.to_char())
217    }
218}
219
220/// Reference sequence metadata from BAM header
221#[derive(Debug, Clone)]
222pub struct RefSeq {
223    pub name: String,
224    pub length: u32,
225}
226
227/// BAM file header
228#[derive(Debug, Clone)]
229pub struct BamHeader {
230    /// SAM header text
231    pub text: String,
232    /// Reference sequences
233    pub references: Vec<RefSeq>,
234}
235
236impl BamHeader {
237    /// Get reference name by index
238    pub fn ref_name(&self, idx: i32) -> Option<&str> {
239        if idx < 0 {
240            return None;
241        }
242        self.references.get(idx as usize).map(|r| r.name.as_str())
243    }
244}
245
246/// BAM alignment record
247#[derive(Debug, Clone)]
248pub struct BamRecord {
249    /// Query template name
250    pub qname: String,
251    /// Bitwise FLAG
252    pub flag: u16,
253    /// Reference sequence ID (-1 if unmapped)
254    pub refid: i32,
255    /// Reference sequence name (resolved from refid during parsing)
256    /// "*" for unmapped reads
257    pub rname: String,
258    /// 0-based leftmost mapping position (-1 if unmapped)
259    pub pos: i32,
260    /// Mapping quality (255 if unavailable)
261    pub mapq: u8,
262    /// CIGAR operations
263    pub cigar: Vec<CigarOp>,
264    /// Reference ID of the mate/next read (-1 if unavailable)
265    pub next_refid: i32,
266    /// Position of the mate/next read (-1 if unavailable)
267    pub next_pos: i32,
268    /// Template length
269    pub tlen: i32,
270    /// Segment sequence (empty if unavailable)
271    pub seq: Vec<u8>,
272    /// Phred-scaled base qualities (empty if unavailable)
273    pub qual: Vec<u8>,
274    /// Optional tags
275    pub tags: HashMap<String, TagValue>,
276}
277
278impl BamRecord {
279    /// Check if read is paired
280    pub fn is_paired(&self) -> bool {
281        (self.flag & 0x1) != 0
282    }
283
284    /// Check if read is mapped in proper pair
285    pub fn is_proper_pair(&self) -> bool {
286        (self.flag & 0x2) != 0
287    }
288
289    /// Check if read is unmapped
290    pub fn is_unmapped(&self) -> bool {
291        (self.flag & 0x4) != 0
292    }
293
294    /// Check if mate is unmapped
295    pub fn is_mate_unmapped(&self) -> bool {
296        (self.flag & 0x8) != 0
297    }
298
299    /// Check if read is reverse strand
300    pub fn is_reverse(&self) -> bool {
301        (self.flag & 0x10) != 0
302    }
303
304    /// Check if mate is reverse strand
305    pub fn is_mate_reverse(&self) -> bool {
306        (self.flag & 0x20) != 0
307    }
308
309    /// Check if read is first in pair
310    pub fn is_first_in_pair(&self) -> bool {
311        (self.flag & 0x40) != 0
312    }
313
314    /// Check if read is second in pair
315    pub fn is_second_in_pair(&self) -> bool {
316        (self.flag & 0x80) != 0
317    }
318
319    /// Check if alignment is secondary
320    pub fn is_secondary(&self) -> bool {
321        (self.flag & 0x100) != 0
322    }
323
324    /// Check if read fails quality checks
325    pub fn is_qc_fail(&self) -> bool {
326        (self.flag & 0x200) != 0
327    }
328
329    /// Check if read is a PCR or optical duplicate
330    pub fn is_duplicate(&self) -> bool {
331        (self.flag & 0x400) != 0
332    }
333
334    /// Check if alignment is supplementary
335    pub fn is_supplementary(&self) -> bool {
336        (self.flag & 0x800) != 0
337    }
338
339    /// Get CIGAR string representation
340    pub fn cigar_string(&self) -> String {
341        self.cigar.iter().map(|op| op.to_string()).collect()
342    }
343
344    /// Get sequence as string
345    pub fn seq_string(&self) -> String {
346        String::from_utf8_lossy(&self.seq).to_string()
347    }
348}
349
350/// Optional tag value types
351#[derive(Debug, Clone)]
352pub enum TagValue {
353    Char(u8),
354    Int(i32),
355    Float(f32),
356    String(String),
357    ByteArray(Vec<u8>),
358}
359
360/// BAM file reader with BGZF decompression
361///
362/// This reader provides streaming access to BAM files,
363/// automatically decompressing BGZF blocks for memory efficiency.
364///
365/// Optimized with reusable decompressor state to avoid repeated allocations.
366pub struct BamReader<R: Read> {
367    inner: BufReader<R>,
368    current_block: Vec<u8>,
369    block_pos: usize,
370    eof: bool,
371    header: Option<BamHeader>,
372    /// Reusable decompressor (avoids creating new decoder for each block)
373    decompressor: Decompress,
374    /// Reusable buffer for compressed data (avoids per-block allocation)
375    compressed_buf: Vec<u8>,
376    /// Number of BGZF blocks decompressed (for chunk-based parallelism)
377    blocks_decompressed: usize,
378    /// Optional path to the file (for parallel processing)
379    file_path: Option<std::path::PathBuf>,
380}
381
382impl BamReader<File> {
383    /// Open a BAM file from a path
384    ///
385    /// # Examples
386    /// ```no_run
387    /// use genomicframe_core::formats::bam::BamReader;
388    ///
389    /// let reader = BamReader::from_path("alignments.bam")?;
390    /// # Ok::<(), genomicframe_core::error::Error>(())
391    /// ```
392    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
393        let path_buf = path.as_ref().to_path_buf();
394        let file = File::open(&path_buf)?;
395        let mut reader = Self::new(file);
396        reader.file_path = Some(path_buf);
397        Ok(reader)
398    }
399
400    pub fn compute_stats(
401        self,
402        config: Option<crate::parallel::ParallelConfig>,
403    ) -> Result<super::stats::BamStats> {
404        // Use the stored path for parallel processing
405        super::stats::BamStats::par_compute(self, config)
406    }
407}
408
409impl<R: Read> BamReader<R> {
410    /// Create a new BAM reader from any Read source
411    pub fn new(reader: R) -> Self {
412        Self {
413            inner: BufReader::new(reader),
414            current_block: Vec::new(),
415            block_pos: 0,
416            eof: false,
417            header: None,
418            decompressor: Decompress::new(false), // false = raw deflate (no zlib wrapper)
419            compressed_buf: Vec::with_capacity(MAX_BLOCK_SIZE),
420            blocks_decompressed: 0,
421            file_path: None,
422        }
423    }
424
425    /// Read and parse the BAM header
426    ///
427    /// This must be called before reading records
428    pub fn read_header(&mut self) -> Result<&BamHeader> {
429        if self.header.is_some() {
430            return Ok(self.header.as_ref().unwrap());
431        }
432
433        // Read magic number (4 bytes: "BAM\1")
434        let mut magic = [0u8; 4];
435        self.read_exact(&mut magic)?;
436
437        if magic != BAM_MAGIC {
438            return Err(Error::InvalidInput(format!(
439                "Invalid BAM magic number: expected {:?}, got {:?}",
440                BAM_MAGIC, magic
441            )));
442        }
443
444        // Read SAM header text length
445        let mut l_text_buf = [0u8; 4];
446        self.read_exact(&mut l_text_buf)?;
447        let l_text = u32::from_le_bytes(l_text_buf) as usize;
448
449        // Read SAM header text
450        let mut text_buf = vec![0u8; l_text];
451        self.read_exact(&mut text_buf)?;
452        let text = String::from_utf8_lossy(&text_buf).to_string();
453
454        // Read number of reference sequences
455        let mut n_ref_buf = [0u8; 4];
456        self.read_exact(&mut n_ref_buf)?;
457        let n_ref = u32::from_le_bytes(n_ref_buf) as usize;
458
459        // Read reference sequences
460        let mut references = Vec::with_capacity(n_ref);
461        for _ in 0..n_ref {
462            // Read reference name length
463            let mut l_name_buf = [0u8; 4];
464            self.read_exact(&mut l_name_buf)?;
465            let l_name = u32::from_le_bytes(l_name_buf) as usize;
466
467            // Read reference name (null-terminated)
468            let mut name_buf = vec![0u8; l_name];
469            self.read_exact(&mut name_buf)?;
470            // Remove null terminator
471            if name_buf.last() == Some(&0) {
472                name_buf.pop();
473            }
474            let name = String::from_utf8_lossy(&name_buf).to_string();
475
476            // Read reference length
477            let mut l_ref_buf = [0u8; 4];
478            self.read_exact(&mut l_ref_buf)?;
479            let length = u32::from_le_bytes(l_ref_buf);
480
481            references.push(RefSeq { name, length });
482        }
483
484        self.header = Some(BamHeader { text, references });
485        Ok(self.header.as_ref().unwrap())
486    }
487
488    /// Get the header (must call read_header first)
489    pub fn header(&self) -> Option<&BamHeader> {
490        self.header.as_ref()
491    }
492
493    /// Set the header manually (for parallel processing)
494    ///
495    /// This is used when creating a BAM reader that starts mid-file at a BGZF
496    /// block boundary. Since the reader won't parse the header itself, we need
497    /// to provide it externally (cloned from the main thread).
498    ///
499    /// # Use Case
500    /// Parallel BAM processing:
501    /// 1. Main thread reads header once
502    /// 2. Worker threads seek to BGZF boundaries
503    /// 3. Worker threads call set_header() to provide header context
504    /// 4. Worker threads can now parse records (refid lookups work)
505    pub fn set_header(&mut self, header: BamHeader) {
506        self.header = Some(header);
507    }
508
509    /// Get the number of BGZF blocks decompressed so far
510    ///
511    /// This is used for chunk-based parallel processing to know when
512    /// a thread has processed its assigned blocks.
513    pub fn blocks_decompressed(&self) -> usize {
514        self.blocks_decompressed
515    }
516
517    /// Get the file path if the reader was opened from a path
518    ///
519    /// Returns `None` if the reader was created from a generic `Read` source.
520    /// Used for parallel processing to reopen the file.
521    pub fn file_path(&self) -> Option<&std::path::Path> {
522        self.file_path.as_deref()
523    }
524
525    /// Parse a single BAM record from raw bytes
526    pub fn parse_record(&self, data: &[u8]) -> Result<BamRecord> {
527        // Manual parsing - much faster than Cursor
528        let mut pos = 0;
529
530        // Helper macro to read bytes and advance position
531        macro_rules! read_bytes {
532            ($n:expr) => {{
533                if pos + $n > data.len() {
534                    return Err(Error::InvalidInput(
535                        "Unexpected end of BAM record".to_string(),
536                    ));
537                }
538                let bytes = &data[pos..pos + $n];
539                pos += $n;
540                bytes
541            }};
542        }
543
544        // Read block size (validates record, but not used)
545        let _block_size = u32::from_le_bytes(read_bytes!(4).try_into().unwrap());
546
547        // Read core fields (36 bytes total after block_size)
548        let refid = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());
549
550        // Resolve reference name from refid using header
551        let rname = if let Some(header) = &self.header {
552            header
553                .ref_name(refid)
554                .unwrap_or("*") // Unmapped or invalid refid
555                .to_string()
556        } else {
557            // Fallback if header not read (shouldn't happen in normal flow)
558            if refid >= 0 {
559                format!("refid_{}", refid)
560            } else {
561                "*".to_string()
562            }
563        };
564
565        let record_pos = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());
566        let l_read_name = read_bytes!(1)[0];
567        let mapq = read_bytes!(1)[0];
568        let _bin = u16::from_le_bytes(read_bytes!(2).try_into().unwrap()); // BAI index bin (not used)
569        let n_cigar_op = u16::from_le_bytes(read_bytes!(2).try_into().unwrap());
570        let flag = u16::from_le_bytes(read_bytes!(2).try_into().unwrap());
571        let l_seq = u32::from_le_bytes(read_bytes!(4).try_into().unwrap());
572        let next_refid = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());
573        let next_pos = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());
574        let tlen = i32::from_le_bytes(read_bytes!(4).try_into().unwrap());
575
576        // Read query name
577        let qname_bytes = read_bytes!(l_read_name as usize);
578        // Remove null terminator
579        let qname_len = if qname_bytes.last() == Some(&0) {
580            qname_bytes.len() - 1
581        } else {
582            qname_bytes.len()
583        };
584        let qname = String::from_utf8_lossy(&qname_bytes[..qname_len]).to_string();
585
586        // Read CIGAR operations (4 bytes each)
587        let mut cigar = Vec::with_capacity(n_cigar_op as usize);
588        for _ in 0..n_cigar_op {
589            let encoded = u32::from_le_bytes(read_bytes!(4).try_into().unwrap());
590            cigar.push(CigarOp::from_encoded(encoded)?);
591        }
592
593        // Read and decode sequence (4 bases per byte, packed)
594        let seq_bytes_len = ((l_seq + 1) / 2) as usize;
595        let seq_buf = read_bytes!(seq_bytes_len);
596
597        // Decode sequence using lookup table
598        let mut seq = Vec::with_capacity(l_seq as usize);
599        for byte in seq_buf {
600            seq.push(SEQ_LOOKUP[(byte >> 4) as usize]);
601            seq.push(SEQ_LOOKUP[(byte & 0x0F) as usize]);
602        }
603        // Trim to exact length if odd
604        seq.truncate(l_seq as usize);
605
606        // Read quality scores
607        let qual_bytes = read_bytes!(l_seq as usize);
608        // Quality 0xFF means unavailable - check first byte for early exit
609        let qual = if l_seq > 0 && qual_bytes[0] == 0xFF && qual_bytes.iter().all(|&q| q == 0xFF) {
610            Vec::new()
611        } else {
612            qual_bytes.to_vec()
613        };
614
615        // Read optional tags (remaining bytes)
616        let mut tags = HashMap::new();
617        while pos < data.len() {
618            // Need at least 3 bytes for tag (2) + type (1)
619            if pos + 3 > data.len() {
620                break;
621            }
622
623            // Read tag (2 ASCII characters)
624            let tag_bytes = read_bytes!(2);
625            let tag = String::from_utf8_lossy(tag_bytes).to_string();
626
627            // Read value type
628            let val_type = read_bytes!(1)[0];
629
630            let value = match val_type {
631                b'A' => TagValue::Char(read_bytes!(1)[0]),
632                b'c' => TagValue::Int(i8::from_le_bytes(read_bytes!(1).try_into().unwrap()) as i32),
633                b'C' => TagValue::Int(read_bytes!(1)[0] as i32),
634                b's' => {
635                    TagValue::Int(i16::from_le_bytes(read_bytes!(2).try_into().unwrap()) as i32)
636                }
637                b'S' => {
638                    TagValue::Int(u16::from_le_bytes(read_bytes!(2).try_into().unwrap()) as i32)
639                }
640                b'i' => TagValue::Int(i32::from_le_bytes(read_bytes!(4).try_into().unwrap())),
641                b'I' => {
642                    TagValue::Int(u32::from_le_bytes(read_bytes!(4).try_into().unwrap()) as i32)
643                }
644                b'f' => TagValue::Float(f32::from_le_bytes(read_bytes!(4).try_into().unwrap())),
645                b'Z' => {
646                    // Null-terminated string
647                    let start = pos;
648                    while pos < data.len() && data[pos] != 0 {
649                        pos += 1;
650                    }
651                    let s = &data[start..pos];
652                    if pos < data.len() {
653                        pos += 1; // Skip null terminator
654                    }
655                    TagValue::String(String::from_utf8_lossy(s).to_string())
656                }
657                b'H' => {
658                    // Hex string (same as Z)
659                    let start = pos;
660                    while pos < data.len() && data[pos] != 0 {
661                        pos += 1;
662                    }
663                    let s = &data[start..pos];
664                    if pos < data.len() {
665                        pos += 1; // Skip null terminator
666                    }
667                    TagValue::String(String::from_utf8_lossy(s).to_string())
668                }
669                b'B' => {
670                    // Array
671                    let array_type = read_bytes!(1)[0];
672                    let count = u32::from_le_bytes(read_bytes!(4).try_into().unwrap()) as usize;
673                    let mut arr = Vec::new();
674                    for _ in 0..count {
675                        match array_type {
676                            b'c' => arr.push(read_bytes!(1)[0]),
677                            b'C' => arr.push(read_bytes!(1)[0]),
678                            b's' => arr.extend_from_slice(read_bytes!(2)),
679                            b'S' => arr.extend_from_slice(read_bytes!(2)),
680                            b'i' => arr.extend_from_slice(read_bytes!(4)),
681                            b'I' => arr.extend_from_slice(read_bytes!(4)),
682                            b'f' => arr.extend_from_slice(read_bytes!(4)),
683                            _ => {
684                                return Err(Error::InvalidInput(format!(
685                                    "Unknown array type: {}",
686                                    array_type
687                                )))
688                            }
689                        }
690                    }
691                    TagValue::ByteArray(arr)
692                }
693                _ => {
694                    return Err(Error::InvalidInput(format!(
695                        "Unknown tag type: {}",
696                        val_type as char
697                    )))
698                }
699            };
700
701            tags.insert(tag, value);
702        }
703
704        Ok(BamRecord {
705            qname,
706            flag,
707            refid,
708            rname,
709            pos: record_pos,
710            mapq,
711            cigar,
712            next_refid,
713            next_pos,
714            tlen,
715            seq,
716            qual,
717            tags,
718        })
719    }
720
721    /// Create an iterator over BAM records
722    ///
723    /// This provides an alternative to using `next_record()` directly
724    pub fn records(&mut self) -> BamRecordIterator<'_, R> {
725        BamRecordIterator { reader: self }
726    }
727
728    /// Read exact number of bytes (handles BGZF block boundaries)
729    fn read_exact(&mut self, buf: &mut [u8]) -> std::io::Result<()> {
730        let mut total_read = 0;
731        while total_read < buf.len() {
732            let n = self.read(&mut buf[total_read..])?;
733            if n == 0 {
734                return Err(std::io::Error::new(
735                    std::io::ErrorKind::UnexpectedEof,
736                    "unexpected end of file",
737                ));
738            }
739            total_read += n;
740        }
741        Ok(())
742    }
743
744    /// Read and decompress the next bam block
745    ///
746    /// Returns true if a block was read, false on EOF
747    fn read_next_block(&mut self) -> Result<bool> {
748        if self.eof {
749            return Ok(false);
750        }
751
752        // Read gzip header (10 bytes minimum)
753        let mut header = [0u8; 10];
754        match self.inner.read_exact(&mut header) {
755            Ok(_) => {}
756            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
757                self.eof = true;
758                return Ok(false);
759            }
760            Err(e) => return Err(Error::Io(e)),
761        }
762
763        // Verify gzip magic number
764        if header[0] != 31 || header[1] != 139 {
765            return Err(Error::InvalidInput(
766                "Invalid bam block: bad gzip magic number".to_string(),
767            ));
768        }
769
770        // Check for extra field (bam requires this)
771        let flags = header[3];
772        let has_extra = (flags & 0x04) != 0;
773
774        if !has_extra {
775            return Err(Error::InvalidInput(
776                "Invalid bam block: missing extra field".to_string(),
777            ));
778        }
779
780        // Read extra field length
781        let mut xlen_buf = [0u8; 2];
782        self.inner.read_exact(&mut xlen_buf)?;
783        let xlen = u16::from_le_bytes(xlen_buf) as usize;
784
785        // Read extra field (contains bam block size)
786        let mut extra = vec![0u8; xlen];
787        self.inner.read_exact(&mut extra)?;
788
789        // Parse bam subfield to get block size
790        let bsize = self.parse_bam_extra(&extra)?;
791
792        // Calculate compressed data size
793        // bsize = total block size - 1
794        // header (10) + xlen (2) + extra (xlen) + compressed data + footer (8) = bsize + 1
795        let compressed_size = bsize + 1 - 10 - 2 - xlen - 8;
796
797        // Read compressed data into reusable buffer
798        self.compressed_buf.clear();
799        self.compressed_buf.resize(compressed_size, 0);
800        self.inner.read_exact(&mut self.compressed_buf)?;
801
802        // Read footer (CRC32 and uncompressed size)
803        let mut footer = [0u8; 8];
804        self.inner.read_exact(&mut footer)?;
805
806        // Reset decompressor for new block
807        self.decompressor.reset(false);
808
809        // Decompress the block using reusable decompressor
810        // Allocate sufficient space for decompressed output
811        self.current_block.clear();
812        self.current_block.resize(MAX_BLOCK_SIZE, 0);
813
814        // Decompress directly into the buffer
815        let before_out = self.decompressor.total_out();
816
817        self.decompressor
818            .decompress(
819                &self.compressed_buf,
820                &mut self.current_block,
821                FlushDecompress::Finish,
822            )
823            .map_err(|e| Error::InvalidInput(format!("Decompression failed: {}", e)))?;
824
825        // Calculate actual bytes decompressed
826        let bytes_out = (self.decompressor.total_out() - before_out) as usize;
827
828        // Truncate to actual decompressed size
829        self.current_block.truncate(bytes_out);
830
831        // Verify uncompressed size doesn't exceed BGZF limit
832        if self.current_block.len() > MAX_BLOCK_SIZE {
833            return Err(Error::InvalidInput(format!(
834                "BGZF block too large: {} bytes (max {})",
835                self.current_block.len(),
836                MAX_BLOCK_SIZE
837            )));
838        }
839
840        self.block_pos = 0;
841        self.blocks_decompressed += 1;
842        Ok(true)
843    }
844
845    /// Parse bam extra field to extract block size
846    fn parse_bam_extra(&self, extra: &[u8]) -> Result<usize> {
847        let mut pos = 0;
848
849        while pos + 4 <= extra.len() {
850            let si1 = extra[pos];
851            let si2 = extra[pos + 1];
852            let slen = u16::from_le_bytes([extra[pos + 2], extra[pos + 3]]) as usize;
853
854            // Check for bam subfield (SI1='B', SI2='C')
855            if si1 == 66 && si2 == 67 {
856                if pos + 4 + slen > extra.len() {
857                    return Err(Error::InvalidInput("bam extra field truncated".to_string()));
858                }
859
860                // bam subfield should be 2 bytes (block size - 1)
861                if slen != 2 {
862                    return Err(Error::InvalidInput(format!(
863                        "Invalid bam subfield length: {}",
864                        slen
865                    )));
866                }
867
868                let bsize = u16::from_le_bytes([extra[pos + 4], extra[pos + 5]]) as usize;
869                return Ok(bsize);
870            }
871
872            pos += 4 + slen;
873        }
874
875        Err(Error::InvalidInput(
876            "bam subfield not found in extra field".to_string(),
877        ))
878    }
879}
880
881impl<R: Read> Read for BamReader<R> {
882    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
883        if buf.is_empty() {
884            return Ok(0);
885        }
886
887        // If current block is exhausted, read next one
888        if self.block_pos >= self.current_block.len() {
889            match self.read_next_block() {
890                Ok(true) => {}
891                Ok(false) => return Ok(0), // EOF
892                Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
893            }
894        }
895
896        // Copy data from current block
897        let available = self.current_block.len() - self.block_pos;
898        let to_copy = buf.len().min(available);
899
900        buf[..to_copy]
901            .copy_from_slice(&self.current_block[self.block_pos..self.block_pos + to_copy]);
902        self.block_pos += to_copy;
903
904        Ok(to_copy)
905    }
906}
907
908// Implement GenomicRecordIterator trait (like VCF and FASTQ)
909impl<R: Read> GenomicRecordIterator for BamReader<R> {
910    type Record = BamRecord;
911
912    fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
913        // Read record size (4 bytes)
914        let mut size_buf = [0u8; 4];
915        match self.read_exact(&mut size_buf) {
916            Ok(_) => {}
917            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
918                return Ok(None);
919            }
920            Err(e) => return Err(e.into()),
921        }
922
923        let block_size = u32::from_le_bytes(size_buf);
924
925        // Read record data
926        let mut record_data = vec![0u8; block_size as usize + 4]; // +4 for block_size field
927        record_data[0..4].copy_from_slice(&size_buf);
928        self.read_exact(&mut record_data[4..])?;
929        Ok(Some(record_data))
930    }
931
932    fn next_record(&mut self) -> Result<Option<Self::Record>> {
933        // Read record size (4 bytes)
934        let record_data = self.next_raw()?;
935        if record_data.is_none() {
936            return Ok(None);
937        }
938        let record_data = record_data.unwrap();
939        // Parse record
940        self.parse_record(&record_data).map(Some)
941    }
942}
943
944// Implement GenomicReader trait
945impl<R: Read> GenomicReader for BamReader<R> {
946    type Metadata = BamHeader;
947
948    fn metadata(&self) -> &Self::Metadata {
949        self.header
950            .as_ref()
951            .expect("Header not read yet. Call read_header() first.")
952    }
953}
954
955/// Iterator over BAM records
956///
957/// This provides an alternative to using `next_record()` directly.
958/// Created by calling `reader.records()`.
959pub struct BamRecordIterator<'a, R: Read> {
960    reader: &'a mut BamReader<R>,
961}
962
963impl<'a, R: Read> Iterator for BamRecordIterator<'a, R> {
964    type Item = Result<BamRecord>;
965
966    fn next(&mut self) -> Option<Self::Item> {
967        match self.reader.next_record() {
968            Ok(Some(record)) => Some(Ok(record)),
969            Ok(None) => None,
970            Err(e) => Some(Err(e)),
971        }
972    }
973}
974
975#[cfg(test)]
976mod tests {
977    use super::*;
978    use std::io::Cursor;
979
980    #[test]
981    fn test_cigar_op_encoding() {
982        // Test CIGAR operation encoding/decoding
983        let encoded = (10 << 4) | 0; // 10M
984        let op = CigarOp::from_encoded(encoded).unwrap();
985        assert_eq!(op, CigarOp::Match(10));
986        assert_eq!(op.len(), 10);
987        assert!(op.consumes_ref());
988        assert!(op.consumes_query());
989        assert_eq!(op.to_char(), 'M');
990    }
991
992    #[test]
993    fn test_cigar_op_display() {
994        assert_eq!(CigarOp::Match(10).to_string(), "10M");
995        assert_eq!(CigarOp::Ins(5).to_string(), "5I");
996        assert_eq!(CigarOp::Del(3).to_string(), "3D");
997    }
998
999    #[test]
1000    fn test_bam_reader_creation() {
1001        let data = vec![0u8; 1024];
1002        let cursor = Cursor::new(data);
1003        let _reader = BamReader::new(cursor);
1004        // Should not panic
1005    }
1006
1007    #[test]
1008    fn test_bam_reader_invalid_magic() {
1009        // Not a valid BGZF file
1010        let data = vec![0u8; 100];
1011        let cursor = Cursor::new(data);
1012        let mut reader = BamReader::new(cursor);
1013
1014        let mut buf = vec![0u8; 10];
1015        let result = reader.read(&mut buf);
1016
1017        // Should fail because it's not valid BGZF
1018        assert!(result.is_err() || result.unwrap() == 0);
1019    }
1020
1021    #[test]
1022    fn test_parse_bgzf_extra_valid() {
1023        let reader = BamReader::new(Cursor::new(vec![]));
1024
1025        // Valid BGZF extra field: SI1=B(66), SI2=C(67), SLEN=2, BSIZE=100
1026        let extra = vec![66, 67, 2, 0, 100, 0];
1027        let result = reader.parse_bam_extra(&extra);
1028
1029        assert!(result.is_ok());
1030        assert_eq!(result.unwrap(), 100);
1031    }
1032
1033    #[test]
1034    fn test_parse_bgzf_extra_invalid() {
1035        let reader = BamReader::new(Cursor::new(vec![]));
1036
1037        // Invalid - no BGZF signature
1038        let extra = vec![1, 2, 2, 0, 100, 0];
1039        let result = reader.parse_bam_extra(&extra);
1040
1041        assert!(result.is_err());
1042    }
1043
1044    #[test]
1045    fn test_parse_bgzf_extra_truncated() {
1046        let reader = BamReader::new(Cursor::new(vec![]));
1047
1048        // Truncated extra field
1049        let extra = vec![66, 67, 2, 0];
1050        let result = reader.parse_bam_extra(&extra);
1051
1052        assert!(result.is_err());
1053    }
1054
1055    #[test]
1056    fn test_max_block_size() {
1057        assert_eq!(MAX_BLOCK_SIZE, 65536);
1058    }
1059
1060    #[test]
1061    fn test_bam_reader_from_empty() {
1062        let cursor = Cursor::new(vec![]);
1063        let mut reader = BamReader::new(cursor);
1064
1065        let mut buf = vec![0u8; 10];
1066        let result = reader.read(&mut buf);
1067
1068        // Should return 0 for EOF, not error
1069        assert!(result.is_ok());
1070        assert_eq!(result.unwrap(), 0);
1071    }
1072
1073    // TODO: Add integration tests with actual BGZF-compressed data
1074}
genomicframe_core/formats/bam/reader.rs

genomicframe_core/formats/bam/
reader.rs