Skip to main content

genomicframe_core/formats/sam/
reader.rs

1//! SAM (Sequence Alignment/Map) format reader
2//!
3//! SAM files are tab-delimited text files containing alignment information.
4//! Each line represents one alignment record with 11 mandatory fields plus optional tags.
5//!
6//! # Design Philosophy
7//!
8//! - **Streaming by default**: Records are never buffered unless explicitly requested
9//! - **O(1) memory**: Process multi-GB files with constant memory
10//! - **Lazy evaluation**: Parse alignments on-demand as you iterate
11//! - **Text-based**: Human readable (unlike BAM's binary format)
12//!
13//! # SAM Format Specification
14//!
15//! Mandatory fields (tab-separated):
16//! 1. QNAME: Query template name
17//! 2. FLAG: Bitwise flags
18//! 3. RNAME: Reference sequence name
19//! 4. POS: 1-based leftmost mapping position
20//! 5. MAPQ: Mapping quality
21//! 6. CIGAR: CIGAR string
22//! 7. RNEXT: Reference name of mate/next read
23//! 8. PNEXT: Position of mate/next read
24//! 9. TLEN: Template length
25//! 10. SEQ: Segment sequence
26//! 11. QUAL: ASCII Phred quality scores
27//! 12+: Optional tags (TAG:TYPE:VALUE)
28//!
29//! # Examples
30//!
31//! ```no_run
32//! use genomicframe_core::formats::sam::SamReader;
33//! use genomicframe_core::core::GenomicRecordIterator;
34//!
35//! // Streaming: O(1) memory, processes one alignment at a time
36//! let mut reader = SamReader::from_path("alignments.sam")?;
37//!
38//! // Read the header first (required)
39//! let header = reader.read_header()?;
40//! println!("References: {}", header.references.len());
41//!
42//! // Iterate through alignments
43//! while let Some(record) = reader.next_record()? {
44//!     if !record.is_unmapped() && record.mapq >= 30 {
45//!         println!("{}: {} at {}", record.qname, record.cigar_string(), record.pos);
46//!     }
47//! }
48//! # Ok::<(), genomicframe_core::error::Error>(())
49//! ```
50
51use crate::core::{GenomicReader, GenomicRecordIterator};
52use crate::error::{Error, Result};
53use crate::formats::bam::{CigarOp, RefSeq, TagValue};
54use crate::io::Compression;
55use flate2::read::MultiGzDecoder;
56use std::collections::HashMap;
57use std::fs::File;
58use std::io::{BufRead, BufReader};
59use std::path::Path;
60
61
62/// SAM file header
63#[derive(Debug, Clone)]
64pub struct SamHeader {
65    /// Header lines (e.g., @HD, @SQ, @RG, @PG, @CO)
66    pub lines: Vec<String>,
67    /// Reference sequences (from @SQ lines)
68    pub references: Vec<RefSeq>,
69    /// Reference name to index mapping (for fast lookup)
70    ref_index: HashMap<String, usize>,
71}
72
73impl SamHeader {
74    /// Create a new empty SAM header
75    pub fn new() -> Self {
76        Self {
77            lines: Vec::new(),
78            references: Vec::new(),
79            ref_index: HashMap::new(),
80        }
81    }
82
83    /// Get reference index by name
84    pub fn ref_index(&self, name: &str) -> Option<usize> {
85        self.ref_index.get(name).copied()
86    }
87
88    /// Get reference name by index
89    pub fn ref_name(&self, idx: usize) -> Option<&str> {
90        self.references.get(idx).map(|r| r.name.as_str())
91    }
92}
93
94impl Default for SamHeader {
95    fn default() -> Self {
96        Self::new()
97    }
98}
99
100/// SAM alignment record
101///
102/// This struct is identical to BAM's BamRecord but parsed from text instead of binary.
103#[derive(Debug, Clone)]
104pub struct SamRecord {
105    /// Query template name
106    pub qname: String,
107    /// Bitwise FLAG
108    pub flag: u16,
109    /// Reference sequence name
110    pub rname: String,
111    /// 1-based leftmost mapping position (0 for unmapped)
112    pub pos: i32,
113    /// Mapping quality (255 if unavailable)
114    pub mapq: u8,
115    /// CIGAR string
116    pub cigar: Vec<CigarOp>,
117    /// Reference name of mate/next read
118    pub rnext: String,
119    /// Position of mate/next read (0 if unavailable)
120    pub pnext: i32,
121    /// Template length
122    pub tlen: i32,
123    /// Segment sequence (empty if unavailable)
124    pub seq: Vec<u8>,
125    /// Phred-scaled base qualities (empty if unavailable)
126    pub qual: Vec<u8>,
127    /// Optional tags
128    pub tags: HashMap<String, TagValue>,
129}
130
131impl SamRecord {
132    /// Check if read is paired
133    pub fn is_paired(&self) -> bool {
134        (self.flag & 0x1) != 0
135    }
136
137    /// Check if read is mapped in proper pair
138    pub fn is_proper_pair(&self) -> bool {
139        (self.flag & 0x2) != 0
140    }
141
142    /// Check if read is unmapped
143    pub fn is_unmapped(&self) -> bool {
144        (self.flag & 0x4) != 0
145    }
146
147    /// Check if mate is unmapped
148    pub fn is_mate_unmapped(&self) -> bool {
149        (self.flag & 0x8) != 0
150    }
151
152    /// Check if read is reverse strand
153    pub fn is_reverse(&self) -> bool {
154        (self.flag & 0x10) != 0
155    }
156
157    /// Check if mate is reverse strand
158    pub fn is_mate_reverse(&self) -> bool {
159        (self.flag & 0x20) != 0
160    }
161
162    /// Check if read is first in pair
163    pub fn is_first_in_pair(&self) -> bool {
164        (self.flag & 0x40) != 0
165    }
166
167    /// Check if read is second in pair
168    pub fn is_second_in_pair(&self) -> bool {
169        (self.flag & 0x80) != 0
170    }
171
172    /// Check if alignment is secondary
173    pub fn is_secondary(&self) -> bool {
174        (self.flag & 0x100) != 0
175    }
176
177    /// Check if read fails quality checks
178    pub fn is_qc_fail(&self) -> bool {
179        (self.flag & 0x200) != 0
180    }
181
182    /// Check if read is a PCR or optical duplicate
183    pub fn is_duplicate(&self) -> bool {
184        (self.flag & 0x400) != 0
185    }
186
187    /// Check if alignment is supplementary
188    pub fn is_supplementary(&self) -> bool {
189        (self.flag & 0x800) != 0
190    }
191
192    /// Get CIGAR string representation
193    pub fn cigar_string(&self) -> String {
194        if self.cigar.is_empty() {
195            "*".to_string()
196        } else {
197            self.cigar.iter().map(|op| op.to_string()).collect()
198        }
199    }
200
201    /// Get sequence as string
202    pub fn seq_string(&self) -> String {
203        if self.seq.is_empty() {
204            "*".to_string()
205        } else {
206            String::from_utf8_lossy(&self.seq).to_string()
207        }
208    }
209}
210
211/// SAM file reader (streaming, memory-efficient)
212pub struct SamReader {
213    reader: Box<dyn BufRead>,
214    header: Option<SamHeader>,
215    line_buffer: String,
216    line_number: usize,
217}
218
219impl SamReader {
220    /// Open a SAM file from a path - handles .sam and .sam.gz automatically
221    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
222        let path = path.as_ref();
223        let file = File::open(path)?;
224        let compression = Compression::from_path(path);
225
226        let reader: Box<dyn BufRead> = match compression {
227            Compression::Gzip => Box::new(BufReader::new(MultiGzDecoder::new(file))),
228            _ => Box::new(BufReader::new(file)),
229        };
230
231        Ok(Self {
232            reader,
233            header: None,
234            line_buffer: String::with_capacity(512),
235            line_number: 0,
236        })
237    }
238
239    /// Create a new SAM reader from any BufRead source
240    pub fn new<R: BufRead + 'static>(reader: R) -> Self {
241        Self {
242            reader: Box::new(reader),
243            header: None,
244            line_buffer: String::with_capacity(512),
245            line_number: 0,
246        }
247    }
248
249    /// Read and parse the SAM header
250    ///
251    /// This must be called before reading records.
252    /// Header lines start with '@', record lines don't.
253    pub fn read_header(&mut self) -> Result<&SamHeader> {
254        if self.header.is_some() {
255            return Ok(self.header.as_ref().unwrap());
256        }
257
258        let mut header = SamHeader::new();
259
260        loop {
261            self.line_buffer.clear();
262            let bytes = self.reader.read_line(&mut self.line_buffer)?;
263            if bytes == 0 {
264                // Empty file
265                break;
266            }
267
268            self.line_number += 1;
269
270            let line = self.line_buffer.trim();
271
272            if line.is_empty() {
273                continue;
274            }
275
276            if !line.starts_with('@') {
277                // First non-header line - put it back by not consuming
278                // We'll re-read it in next_record()
279                self.line_number -= 1;
280                break;
281            }
282
283            header.lines.push(line.to_string());
284
285            // Parse @SQ (reference sequence) lines
286            if line.starts_with("@SQ") {
287                let mut name = None;
288                let mut length = None;
289
290                // Parse tab-separated fields
291                for field in line.split('\t').skip(1) {
292                    if let Some((key, value)) = field.split_once(':') {
293                        match key {
294                            "SN" => name = Some(value.to_string()),
295                            "LN" => length = value.parse::<u32>().ok(),
296                            _ => {}
297                        }
298                    }
299                }
300
301                if let (Some(name), Some(length)) = (name, length) {
302                    let idx = header.references.len();
303                    header.ref_index.insert(name.clone(), idx);
304                    header.references.push(RefSeq { name, length });
305                }
306            }
307        }
308
309        self.header = Some(header);
310        Ok(self.header.as_ref().unwrap())
311    }
312
313    /// Get the header (must call read_header first)
314    pub fn header(&self) -> Option<&SamHeader> {
315        self.header.as_ref()
316    }
317
318    /// Parse CIGAR string into operations (delegates to shared implementation)
319    fn parse_cigar(cigar_str: &str) -> Result<Vec<CigarOp>> {
320        CigarOp::parse_cigar_string(cigar_str)
321    }
322
323    /// Parse a SAM record line
324    fn parse_record(&mut self, line: &str) -> Result<SamRecord> {
325        let fields: Vec<&str> = line.split('\t').collect();
326
327        if fields.len() < 11 {
328            return Err(Error::Parse(format!(
329                "Line {}: SAM record must have at least 11 fields, got {}",
330                self.line_number,
331                fields.len()
332            )));
333        }
334
335        // Parse mandatory fields
336        let qname = fields[0].to_string();
337        let flag = fields[1].parse::<u16>().map_err(|_| {
338            Error::Parse(format!("Line {}: Invalid FLAG: {}", self.line_number, fields[1]))
339        })?;
340        let rname = fields[2].to_string();
341        let pos = fields[3].parse::<i32>().map_err(|_| {
342            Error::Parse(format!("Line {}: Invalid POS: {}", self.line_number, fields[3]))
343        })?;
344        let mapq = fields[4].parse::<u8>().map_err(|_| {
345            Error::Parse(format!("Line {}: Invalid MAPQ: {}", self.line_number, fields[4]))
346        })?;
347        let cigar = Self::parse_cigar(fields[5])?;
348        let rnext = fields[6].to_string();
349        let pnext = fields[7].parse::<i32>().map_err(|_| {
350            Error::Parse(format!("Line {}: Invalid PNEXT: {}", self.line_number, fields[7]))
351        })?;
352        let tlen = fields[8].parse::<i32>().map_err(|_| {
353            Error::Parse(format!("Line {}: Invalid TLEN: {}", self.line_number, fields[8]))
354        })?;
355
356        // Sequence
357        let seq = if fields[9] == "*" {
358            Vec::new()
359        } else {
360            fields[9].as_bytes().to_vec()
361        };
362
363        // Quality
364        let qual = if fields[10] == "*" {
365            Vec::new()
366        } else {
367            // Convert ASCII to Phred+33 scores
368            fields[10].bytes().map(|b| b.saturating_sub(33)).collect()
369        };
370
371        // Parse optional tags
372        let mut tags = HashMap::new();
373        for field in fields.iter().skip(11) {
374            if let Some((tag, rest)) = field.split_once(':') {
375                if let Some((type_str, value)) = rest.split_once(':') {
376                    let tag_value = match type_str {
377                        "A" => {
378                            // Single character
379                            TagValue::Char(value.bytes().next().unwrap_or(0))
380                        }
381                        "i" => {
382                            // Integer
383                            TagValue::Int(value.parse().unwrap_or(0))
384                        }
385                        "f" => {
386                            // Float
387                            TagValue::Float(value.parse().unwrap_or(0.0))
388                        }
389                        "Z" => {
390                            // String
391                            TagValue::String(value.to_string())
392                        }
393                        "H" => {
394                            // Hex string
395                            TagValue::String(value.to_string())
396                        }
397                        "B" => {
398                            // Array (skip complex parsing for now)
399                            TagValue::ByteArray(value.as_bytes().to_vec())
400                        }
401                        _ => continue,
402                    };
403
404                    tags.insert(tag.to_string(), tag_value);
405                }
406            }
407        }
408
409        Ok(SamRecord {
410            qname,
411            flag,
412            rname,
413            pos,
414            mapq,
415            cigar,
416            rnext,
417            pnext,
418            tlen,
419            seq,
420            qual,
421            tags,
422        })
423    }
424
425    /// Compute statistics from this reader
426    pub fn compute_stats(
427        mut self,
428        _config: Option<crate::parallel::ParallelConfig>,
429    ) -> Result<super::stats::SamStats> {
430        // Ensure header is read
431        self.read_header()?;
432
433        // For now, SAM uses sequential processing (parallel SAM is complex due to text format)
434        super::stats::SamStats::compute(&mut self)
435    }
436}
437
438impl GenomicRecordIterator for SamReader {
439    type Record = SamRecord;
440
441    fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
442        self.line_buffer.clear();
443        let bytes = self.reader.read_line(&mut self.line_buffer)?;
444
445        if bytes == 0 {
446            return Ok(None);
447        }
448
449        self.line_number += 1;
450
451        let line = self.line_buffer.trim();
452
453        // Skip empty lines
454        if line.is_empty() {
455            return self.next_raw();
456        }
457
458        // Skip header lines (shouldn't happen after read_header, but be safe)
459        if line.starts_with('@') {
460            return self.next_raw();
461        }
462
463        Ok(Some(line.as_bytes().to_vec()))
464    }
465
466    fn next_record(&mut self) -> Result<Option<Self::Record>> {
467        self.line_buffer.clear();
468        let bytes = self.reader.read_line(&mut self.line_buffer)?;
469
470        if bytes == 0 {
471            return Ok(None);
472        }
473
474        self.line_number += 1;
475
476        let line = self.line_buffer.trim().to_string(); // Clone to avoid borrow issue
477
478        // Skip empty lines
479        if line.is_empty() {
480            return self.next_record();
481        }
482
483        // Skip header lines (shouldn't happen after read_header, but be safe)
484        if line.starts_with('@') {
485            return self.next_record();
486        }
487
488        Ok(Some(self.parse_record(&line)?))
489    }
490}
491
492impl GenomicReader for SamReader {
493    type Metadata = SamHeader;
494
495    fn metadata(&self) -> &Self::Metadata {
496        self.header
497            .as_ref()
498            .expect("Header not read yet. Call read_header() first.")
499    }
500}
501
502#[cfg(test)]
503mod tests {
504    use super::*;
505
506    #[test]
507    fn test_cigar_parsing() {
508        let cigar = SamReader::parse_cigar("10M2I5D").unwrap();
509        assert_eq!(cigar.len(), 3);
510        assert_eq!(cigar[0], CigarOp::Match(10));
511        assert_eq!(cigar[1], CigarOp::Ins(2));
512        assert_eq!(cigar[2], CigarOp::Del(5));
513    }
514
515    #[test]
516    fn test_cigar_star() {
517        let cigar = SamReader::parse_cigar("*").unwrap();
518        assert!(cigar.is_empty());
519    }
520
521    #[test]
522    fn test_sam_record_flags() {
523        let record = SamRecord {
524            qname: "read1".to_string(),
525            flag: 0x1 | 0x2, // Paired and proper pair
526            rname: "chr1".to_string(),
527            pos: 100,
528            mapq: 60,
529            cigar: vec![CigarOp::Match(100)],
530            rnext: "=".to_string(),
531            pnext: 200,
532            tlen: 150,
533            seq: b"ACGT".to_vec(),
534            qual: vec![30, 30, 30, 30],
535            tags: HashMap::new(),
536        };
537
538        assert!(record.is_paired());
539        assert!(record.is_proper_pair());
540        assert!(!record.is_unmapped());
541        assert!(!record.is_duplicate());
542    }
543}