Skip to main content

fastqc_rust/sequence/
bam.rs

1// BAM/SAM file reader
2// Corresponds to Sequence/BAMFile.java
3
4use std::fs::File;
5use std::io::{self, BufReader, Seek};
6use std::path::Path;
7
8use noodles::bam;
9use noodles::bgzf;
10use noodles::sam;
11use noodles::sam::alignment::record::cigar::op::Kind as CigarKind;
12// Import the Sequence trait so .iter() is available on SAM record sequences
13use noodles::sam::alignment::record::Sequence as _;
14
15use super::{Sequence, SequenceFile};
16use crate::utils::dna::reverse_complement;
17
18// ---------------------------------------------------------------------------
19// Reader abstraction over BAM and SAM formats
20// ---------------------------------------------------------------------------
21
22/// Wrapper enum to unify BAM and SAM record iteration behind a single type.
23///
24/// Java uses HTSJDK's SamReader which auto-detects format.
25/// We dispatch explicitly based on file extension/format config.
26enum InnerReader {
27    Bam(bam::io::Reader<bgzf::Reader<File>>),
28    Sam(sam::io::Reader<BufReader<File>>),
29}
30
31// ---------------------------------------------------------------------------
32// BAMFile
33// ---------------------------------------------------------------------------
34
35/// BAM/SAM file reader that supports both BAM and SAM input formats.
36///
37/// Mirrors `Sequence.BAMFile` in Java. Uses a look-ahead design
38/// where `read_next()` is called at construction time and after each `next()` call.
39pub struct BAMFile {
40    reader: InnerReader,
41    name: String,
42    file_size: u64,
43    only_mapped: bool,
44    /// Cloned file handle for progress tracking via seek position.
45    position_handle: Option<File>,
46
47    /// The next sequence ready to be returned (look-ahead buffer).
48    /// Matches Java's `nextSequence` field.
49    next_sequence: Option<Sequence>,
50
51    /// Whether this is a BAM (true) or SAM (false) file, used for progress estimation.
52    is_bam: bool,
53
54    /// Rough record size for progress estimation.
55    /// Java computes this as `(readLength * 2) + 150`, divided by 4 for BAM.
56    record_size: u64,
57
58    // Reusable record buffers to avoid allocations per record.
59    bam_record: bam::Record,
60    sam_record: sam::Record,
61}
62
63impl BAMFile {
64    /// Open a BAM or SAM file for reading.
65    ///
66    /// If `is_bam` is true, the file is read as BAM format; otherwise as SAM.
67    /// If `only_mapped` is true, unmapped reads are skipped.
68    ///
69    /// The Java constructor opens the file, creates a SamReader with
70    /// SILENT validation stringency, gets an iterator, and calls readNext() to prime
71    /// the look-ahead buffer.
72    pub fn open(path: &Path, is_bam: bool, only_mapped: bool) -> io::Result<Self> {
73        let name = path
74            .file_name()
75            .map(|n| n.to_string_lossy().into_owned())
76            .unwrap_or_else(|| path.to_string_lossy().into_owned());
77
78        let file_size = std::fs::metadata(path)?.len();
79
80        // Clone the file handle before passing to the reader so we can
81        // query the compressed byte position for progress tracking.
82        let file = File::open(path)?;
83        let position_handle = file.try_clone().ok();
84
85        let reader = if is_bam {
86            let mut r = bam::io::Reader::new(file);
87            let _header = r.read_header()?;
88            InnerReader::Bam(r)
89        } else {
90            let buf = BufReader::new(file);
91            let mut r = sam::io::Reader::new(buf);
92            let _header = r.read_header()?;
93            InnerReader::Sam(r)
94        };
95
96        let mut bam_file = BAMFile {
97            reader,
98            name,
99            file_size,
100            only_mapped,
101            position_handle,
102            next_sequence: None,
103            is_bam,
104            record_size: 0,
105            bam_record: bam::Record::default(),
106            sam_record: sam::Record::default(),
107        };
108
109        // Prime the look-ahead buffer by reading the first record.
110        bam_file.read_next()?;
111
112        Ok(bam_file)
113    }
114
115    /// Read the next record and convert it to a Sequence, storing in `self.next_sequence`.
116    ///
117    /// This mirrors `readNext()` in BAMFile.java, including:
118    /// - Skipping unmapped reads when `only_mapped` is true
119    /// - Soft-clip trimming for mapped-only mode
120    /// - Reverse complementing reads on the negative strand
121    fn read_next(&mut self) -> io::Result<()> {
122        loop {
123            // Read the next record from the appropriate reader type
124            let record_data = match &mut self.reader {
125                InnerReader::Bam(reader) => {
126                    match reader.read_record(&mut self.bam_record) {
127                        Ok(0) => None, // EOF
128                        Ok(_) => {
129                            let rec = &self.bam_record;
130                            let flags = rec.flags();
131
132                            // Skip unmapped reads if only_mapped is set.
133                            // Java: `if (onlyMapped && record.getReadUnmappedFlag()) continue;`
134                            if self.only_mapped && flags.is_unmapped() {
135                                continue;
136                            }
137
138                            // Extract fields from BAM record
139                            let name = rec
140                                .name()
141                                .map(|n| String::from_utf8_lossy(n.as_ref()).into_owned())
142                                .unwrap_or_else(|| "*".to_string());
143
144                            let sequence_bases: Vec<u8> = rec.sequence().iter().collect();
145                            let seq_len = sequence_bases.len();
146
147                            // BAM quality scores are raw Phred values (0-93).
148                            // Java's HTSJDK `getBaseQualityString()` converts them to ASCII
149                            // by adding 33 (Sanger/Phred+33 encoding). We do the same.
150                            let quality_bytes: Vec<u8> = rec
151                                .quality_scores()
152                                .as_ref()
153                                .iter()
154                                .map(|&q| q + 33) // Convert Phred+0 to ASCII Phred+33
155                                .collect();
156
157                            // Collect CIGAR ops for soft-clip handling
158                            let cigar_ops: Vec<(CigarKind, usize)> = rec
159                                .cigar()
160                                .iter()
161                                .filter_map(|r| r.ok())
162                                .map(|op| (op.kind(), op.len()))
163                                .collect();
164
165                            let is_reverse = flags.is_reverse_complemented();
166
167                            // Check vendor quality failure flag (0x200)
168                            // for CASAVA-style filtering.
169                            let is_qc_fail = flags.is_qc_fail();
170
171                            // Estimate record size for progress tracking.
172                            // Java: `recordSize = (record.getReadLength()*2)+150`
173                            // then divides by 4 for BAM format.
174                            if self.record_size == 0 && seq_len > 0 {
175                                let mut rs = (seq_len as u64 * 2) + 150;
176                                if self.is_bam {
177                                    rs /= 4; // BAM records are ~4x smaller than SAM
178                                }
179                                self.record_size = rs;
180                            }
181
182                            Some(RecordData {
183                                name,
184                                sequence: sequence_bases,
185                                quality: quality_bytes,
186                                cigar_ops,
187                                is_reverse,
188                                is_qc_fail,
189                            })
190                        }
191                        Err(e) => return Err(e),
192                    }
193                }
194                InnerReader::Sam(reader) => {
195                    match reader.read_record(&mut self.sam_record) {
196                        Ok(0) => None, // EOF
197                        Ok(_) => {
198                            let rec = &self.sam_record;
199                            let flags = rec
200                                .flags()
201                                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
202
203                            // Skip unmapped reads if only_mapped is set.
204                            if self.only_mapped && flags.is_unmapped() {
205                                continue;
206                            }
207
208                            let name = rec
209                                .name()
210                                .map(|n| String::from_utf8_lossy(n.as_ref()).into_owned())
211                                .unwrap_or_else(|| "*".to_string());
212
213                            // SAM sequence is stored as ASCII text
214                            let sequence_bases: Vec<u8> = rec.sequence().iter().collect();
215                            let seq_len = sequence_bases.len();
216
217                            // SAM quality scores from HTSJDK's getBaseQualityString()
218                            // are already in Phred+33 ASCII encoding. The noodles SAM reader
219                            // also stores them as raw ASCII bytes, so we use them directly.
220                            let quality_bytes: Vec<u8> = rec.quality_scores().as_ref().to_vec();
221
222                            // Collect CIGAR ops
223                            let cigar_ops: Vec<(CigarKind, usize)> = rec
224                                .cigar()
225                                .iter()
226                                .filter_map(|r| r.ok())
227                                .map(|op| (op.kind(), op.len()))
228                                .collect();
229
230                            let is_reverse = flags.is_reverse_complemented();
231                            let is_qc_fail = flags.is_qc_fail();
232
233                            if self.record_size == 0 && seq_len > 0 {
234                                let mut rs = (seq_len as u64 * 2) + 150;
235                                if self.is_bam {
236                                    rs /= 4;
237                                }
238                                self.record_size = rs;
239                            }
240
241                            Some(RecordData {
242                                name,
243                                sequence: sequence_bases,
244                                quality: quality_bytes,
245                                cigar_ops,
246                                is_reverse,
247                                is_qc_fail,
248                            })
249                        }
250                        Err(e) => return Err(e),
251                    }
252                }
253            };
254
255            match record_data {
256                None => {
257                    // EOF
258                    self.next_sequence = None;
259                    return Ok(());
260                }
261                Some(data) => {
262                    let mut sequence = data.sequence;
263                    let mut quality = data.quality;
264
265                    // If only working with mapped data, exclude soft-clipped
266                    // regions from the sequence and quality. Java clips the 3' end first
267                    // (so that 5' indices remain correct), then clips the 5' end.
268                    if self.only_mapped && !data.cigar_ops.is_empty() {
269                        // Clip 3' end first (last CIGAR element)
270                        // Java checks `elements.get(elements.size()-1).getOperator().equals(CigarOperator.S)`
271                        if let Some(&(kind, len)) = data.cigar_ops.last() {
272                            if kind == CigarKind::SoftClip {
273                                let new_len = sequence.len().saturating_sub(len);
274                                sequence.truncate(new_len);
275                                quality.truncate(new_len);
276                            }
277                        }
278
279                        // Clip 5' end (first CIGAR element)
280                        // Java checks `elements.get(0).getOperator().equals(CigarOperator.S)`
281                        if let Some(&(kind, len)) = data.cigar_ops.first() {
282                            if kind == CigarKind::SoftClip {
283                                // Java uses `sequence.substring(value)` which
284                                // drops the first `value` characters.
285                                if len <= sequence.len() {
286                                    sequence = sequence[len..].to_vec();
287                                    quality = quality[len..].to_vec();
288                                }
289                            }
290                        }
291                    }
292
293                    // BAM/SAM files always show sequence relative to the top
294                    // strand of the mapped reference. If this read maps to the reverse strand,
295                    // we reverse complement the sequence and reverse the qualities to recover
296                    // the original read orientation. Java does this unconditionally when
297                    // getReadNegativeStrandFlag() is true, regardless of only_mapped.
298                    if data.is_reverse {
299                        sequence = reverse_complement(&sequence);
300                        quality.reverse();
301                    }
302
303                    let mut seq = Sequence::new(data.name, sequence, quality);
304
305                    // Java's BAMFile.java does not explicitly handle CASAVA
306                    // filtering via the read name (that is a FastQ convention). However,
307                    // the SAM flag 0x200 (QC fail / vendor quality check failure) serves
308                    // a similar purpose. We set is_filtered when the QC fail flag is set.
309                    if data.is_qc_fail {
310                        seq.is_filtered = true;
311                    }
312
313                    self.next_sequence = Some(seq);
314                    return Ok(());
315                }
316            }
317        }
318    }
319}
320
321/// Intermediate struct holding extracted record fields, used to avoid
322/// borrow-checker issues with the reader and record.
323struct RecordData {
324    name: String,
325    sequence: Vec<u8>,
326    quality: Vec<u8>,
327    cigar_ops: Vec<(CigarKind, usize)>,
328    is_reverse: bool,
329    is_qc_fail: bool,
330}
331
332impl SequenceFile for BAMFile {
333    fn next(&mut self) -> Option<io::Result<Sequence>> {
334        // Java's `next()` returns the current `nextSequence` then calls
335        // `readNext()` to prime the next one. Same pattern as FastQFile.
336        let current = self.next_sequence.take()?;
337        if let Err(e) = self.read_next() {
338            return Some(Err(e));
339        }
340        Some(Ok(current))
341    }
342
343    fn name(&self) -> &str {
344        &self.name
345    }
346
347    /// BAM/SAM files are never colorspace.
348    /// Java's `BAMFile.isColorspace()` always returns false.
349    fn is_colorspace(&self) -> bool {
350        false
351    }
352
353    /// Java tracks progress using the raw FileInputStream position
354    /// divided by file size. For BAM files, this gives a rough estimate since
355    /// the underlying file is BGZF compressed. We estimate using the record size
356    /// heuristic from Java when precise position tracking is not available.
357    fn percent_complete(&self) -> f64 {
358        if self.next_sequence.is_none() {
359            return 100.0;
360        }
361        // Track progress via compressed byte position, same as FASTQ reader.
362        if let Some(ref handle) = self.position_handle {
363            if let Ok(mut h) = handle.try_clone() {
364                if let Ok(pos) = h.stream_position() {
365                    return (pos as f64 / self.file_size as f64) * 100.0;
366                }
367            }
368        }
369        0.0
370    }
371}
372
373// ---------------------------------------------------------------------------
374// Format detection and factory function
375// ---------------------------------------------------------------------------
376
377/// Detect the file format from the config or file extension, and open the appropriate reader.
378///
379/// This mirrors the format detection logic in `SequenceFactory.java`:
380/// - If `config.sequence_format` is set, use that explicitly
381/// - Otherwise, detect by file extension
382///
383/// Returns a boxed `SequenceFile` trait object.
384pub fn open_sequence_file(
385    config: &crate::config::FastQCConfig,
386    path: &Path,
387) -> io::Result<Box<dyn SequenceFile>> {
388    use super::fastq::FastQFile;
389
390    // Java checks FastQCConfig.getInstance().getSequenceFormat() first.
391    // If set, it overrides extension-based detection.
392    if let Some(ref format) = config.sequence_format {
393        let format_lower = format.to_lowercase();
394        match format_lower.as_str() {
395            // "bam" format reads all records (mapped + unmapped)
396            "bam" => {
397                return Ok(Box::new(BAMFile::open(path, true, false)?));
398            }
399            // "sam" format reads all records
400            "sam" => {
401                return Ok(Box::new(BAMFile::open(path, false, false)?));
402            }
403            // "bam_mapped" reads only mapped records from BAM
404            "bam_mapped" => {
405                return Ok(Box::new(BAMFile::open(path, true, true)?));
406            }
407            // "sam_mapped" reads only mapped records from SAM
408            "sam_mapped" => {
409                return Ok(Box::new(BAMFile::open(path, false, true)?));
410            }
411            // "fastq" or "fastq.gz" or similar -> FastQFile
412            "fastq" => {
413                return Ok(Box::new(FastQFile::open(config, path)?));
414            }
415            other => {
416                return Err(io::Error::new(
417                    io::ErrorKind::InvalidInput,
418                    format!("Unknown sequence format: {}", other),
419                ));
420            }
421        }
422    }
423
424    // Extension-based detection from SequenceFactory.java
425    let name_lower = path
426        .file_name()
427        .map(|n| n.to_string_lossy().to_lowercase())
428        .unwrap_or_default();
429
430    if name_lower.ends_with(".bam") || name_lower.ends_with(".ubam") {
431        // .bam and .ubam files are opened as BAM with all reads
432        Ok(Box::new(BAMFile::open(path, true, false)?))
433    } else if name_lower.ends_with(".sam") {
434        // .sam files are opened as SAM with all reads
435        Ok(Box::new(BAMFile::open(path, false, false)?))
436    } else if name_lower.ends_with(".fast5") {
437        // Open Fast5 (Nanopore HDF5) files via pure-Rust hdf5-pure crate
438        Ok(Box::new(super::fast5::Fast5File::open(path)?))
439    } else {
440        // Everything else is assumed to be FASTQ
441        Ok(Box::new(FastQFile::open(config, path)?))
442    }
443}
444
445// ---------------------------------------------------------------------------
446// Tests
447// ---------------------------------------------------------------------------
448
449#[cfg(test)]
450mod tests {
451    use super::*;
452
453    #[test]
454    fn test_format_detection_fast5() {
455        // Fast5 files are now supported but still fail if the file doesn't exist
456        let config = crate::config::FastQCConfig::default();
457        let result = open_sequence_file(&config, Path::new("nonexistent.fast5"));
458        assert!(result.is_err());
459    }
460
461    #[test]
462    fn test_format_detection_unknown_format() {
463        let config = crate::config::FastQCConfig {
464            sequence_format: Some("unknown_format".to_string()),
465            ..Default::default()
466        };
467        let result = open_sequence_file(&config, Path::new("test.dat"));
468        assert!(result.is_err());
469    }
470}