fastqc_rust/sequence/bam.rs
1// BAM/SAM file reader
2// Corresponds to Sequence/BAMFile.java
3
4use std::fs::File;
5use std::io::{self, BufReader, Seek};
6use std::path::Path;
7
8use noodles::bam;
9use noodles::bgzf;
10use noodles::sam;
11use noodles::sam::alignment::record::cigar::op::Kind as CigarKind;
12// Import the Sequence trait so .iter() is available on SAM record sequences
13use noodles::sam::alignment::record::Sequence as _;
14
15use super::{Sequence, SequenceFile};
16use crate::utils::dna::reverse_complement;
17
18// ---------------------------------------------------------------------------
19// Reader abstraction over BAM and SAM formats
20// ---------------------------------------------------------------------------
21
22/// Wrapper enum to unify BAM and SAM record iteration behind a single type.
23///
24/// Java uses HTSJDK's SamReader which auto-detects format.
25/// We dispatch explicitly based on file extension/format config.
26enum InnerReader {
27 Bam(bam::io::Reader<bgzf::Reader<File>>),
28 Sam(sam::io::Reader<BufReader<File>>),
29}
30
31// ---------------------------------------------------------------------------
32// BAMFile
33// ---------------------------------------------------------------------------
34
35/// BAM/SAM file reader that supports both BAM and SAM input formats.
36///
37/// Mirrors `Sequence.BAMFile` in Java. Uses a look-ahead design
38/// where `read_next()` is called at construction time and after each `next()` call.
39pub struct BAMFile {
40 reader: InnerReader,
41 name: String,
42 file_size: u64,
43 only_mapped: bool,
44 /// Cloned file handle for progress tracking via seek position.
45 position_handle: Option<File>,
46
47 /// The next sequence ready to be returned (look-ahead buffer).
48 /// Matches Java's `nextSequence` field.
49 next_sequence: Option<Sequence>,
50
51 /// Whether this is a BAM (true) or SAM (false) file, used for progress estimation.
52 is_bam: bool,
53
54 /// Rough record size for progress estimation.
55 /// Java computes this as `(readLength * 2) + 150`, divided by 4 for BAM.
56 record_size: u64,
57
58 // Reusable record buffers to avoid allocations per record.
59 bam_record: bam::Record,
60 sam_record: sam::Record,
61}
62
63impl BAMFile {
64 /// Open a BAM or SAM file for reading.
65 ///
66 /// If `is_bam` is true, the file is read as BAM format; otherwise as SAM.
67 /// If `only_mapped` is true, unmapped reads are skipped.
68 ///
69 /// The Java constructor opens the file, creates a SamReader with
70 /// SILENT validation stringency, gets an iterator, and calls readNext() to prime
71 /// the look-ahead buffer.
72 pub fn open(path: &Path, is_bam: bool, only_mapped: bool) -> io::Result<Self> {
73 let name = path
74 .file_name()
75 .map(|n| n.to_string_lossy().into_owned())
76 .unwrap_or_else(|| path.to_string_lossy().into_owned());
77
78 let file_size = std::fs::metadata(path)?.len();
79
80 // Clone the file handle before passing to the reader so we can
81 // query the compressed byte position for progress tracking.
82 let file = File::open(path)?;
83 let position_handle = file.try_clone().ok();
84
85 let reader = if is_bam {
86 let mut r = bam::io::Reader::new(file);
87 let _header = r.read_header()?;
88 InnerReader::Bam(r)
89 } else {
90 let buf = BufReader::new(file);
91 let mut r = sam::io::Reader::new(buf);
92 let _header = r.read_header()?;
93 InnerReader::Sam(r)
94 };
95
96 let mut bam_file = BAMFile {
97 reader,
98 name,
99 file_size,
100 only_mapped,
101 position_handle,
102 next_sequence: None,
103 is_bam,
104 record_size: 0,
105 bam_record: bam::Record::default(),
106 sam_record: sam::Record::default(),
107 };
108
109 // Prime the look-ahead buffer by reading the first record.
110 bam_file.read_next()?;
111
112 Ok(bam_file)
113 }
114
115 /// Read the next record and convert it to a Sequence, storing in `self.next_sequence`.
116 ///
117 /// This mirrors `readNext()` in BAMFile.java, including:
118 /// - Skipping unmapped reads when `only_mapped` is true
119 /// - Soft-clip trimming for mapped-only mode
120 /// - Reverse complementing reads on the negative strand
121 fn read_next(&mut self) -> io::Result<()> {
122 loop {
123 // Read the next record from the appropriate reader type
124 let record_data = match &mut self.reader {
125 InnerReader::Bam(reader) => {
126 match reader.read_record(&mut self.bam_record) {
127 Ok(0) => None, // EOF
128 Ok(_) => {
129 let rec = &self.bam_record;
130 let flags = rec.flags();
131
132 // Skip unmapped reads if only_mapped is set.
133 // Java: `if (onlyMapped && record.getReadUnmappedFlag()) continue;`
134 if self.only_mapped && flags.is_unmapped() {
135 continue;
136 }
137
138 // Extract fields from BAM record
139 let name = rec
140 .name()
141 .map(|n| String::from_utf8_lossy(n.as_ref()).into_owned())
142 .unwrap_or_else(|| "*".to_string());
143
144 let sequence_bases: Vec<u8> = rec.sequence().iter().collect();
145 let seq_len = sequence_bases.len();
146
147 // BAM quality scores are raw Phred values (0-93).
148 // Java's HTSJDK `getBaseQualityString()` converts them to ASCII
149 // by adding 33 (Sanger/Phred+33 encoding). We do the same.
150 let quality_bytes: Vec<u8> = rec
151 .quality_scores()
152 .as_ref()
153 .iter()
154 .map(|&q| q + 33) // Convert Phred+0 to ASCII Phred+33
155 .collect();
156
157 // Collect CIGAR ops for soft-clip handling
158 let cigar_ops: Vec<(CigarKind, usize)> = rec
159 .cigar()
160 .iter()
161 .filter_map(|r| r.ok())
162 .map(|op| (op.kind(), op.len()))
163 .collect();
164
165 let is_reverse = flags.is_reverse_complemented();
166
167 // Check vendor quality failure flag (0x200)
168 // for CASAVA-style filtering.
169 let is_qc_fail = flags.is_qc_fail();
170
171 // Estimate record size for progress tracking.
172 // Java: `recordSize = (record.getReadLength()*2)+150`
173 // then divides by 4 for BAM format.
174 if self.record_size == 0 && seq_len > 0 {
175 let mut rs = (seq_len as u64 * 2) + 150;
176 if self.is_bam {
177 rs /= 4; // BAM records are ~4x smaller than SAM
178 }
179 self.record_size = rs;
180 }
181
182 Some(RecordData {
183 name,
184 sequence: sequence_bases,
185 quality: quality_bytes,
186 cigar_ops,
187 is_reverse,
188 is_qc_fail,
189 })
190 }
191 Err(e) => return Err(e),
192 }
193 }
194 InnerReader::Sam(reader) => {
195 match reader.read_record(&mut self.sam_record) {
196 Ok(0) => None, // EOF
197 Ok(_) => {
198 let rec = &self.sam_record;
199 let flags = rec
200 .flags()
201 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
202
203 // Skip unmapped reads if only_mapped is set.
204 if self.only_mapped && flags.is_unmapped() {
205 continue;
206 }
207
208 let name = rec
209 .name()
210 .map(|n| String::from_utf8_lossy(n.as_ref()).into_owned())
211 .unwrap_or_else(|| "*".to_string());
212
213 // SAM sequence is stored as ASCII text
214 let sequence_bases: Vec<u8> = rec.sequence().iter().collect();
215 let seq_len = sequence_bases.len();
216
217 // SAM quality scores from HTSJDK's getBaseQualityString()
218 // are already in Phred+33 ASCII encoding. The noodles SAM reader
219 // also stores them as raw ASCII bytes, so we use them directly.
220 let quality_bytes: Vec<u8> = rec.quality_scores().as_ref().to_vec();
221
222 // Collect CIGAR ops
223 let cigar_ops: Vec<(CigarKind, usize)> = rec
224 .cigar()
225 .iter()
226 .filter_map(|r| r.ok())
227 .map(|op| (op.kind(), op.len()))
228 .collect();
229
230 let is_reverse = flags.is_reverse_complemented();
231 let is_qc_fail = flags.is_qc_fail();
232
233 if self.record_size == 0 && seq_len > 0 {
234 let mut rs = (seq_len as u64 * 2) + 150;
235 if self.is_bam {
236 rs /= 4;
237 }
238 self.record_size = rs;
239 }
240
241 Some(RecordData {
242 name,
243 sequence: sequence_bases,
244 quality: quality_bytes,
245 cigar_ops,
246 is_reverse,
247 is_qc_fail,
248 })
249 }
250 Err(e) => return Err(e),
251 }
252 }
253 };
254
255 match record_data {
256 None => {
257 // EOF
258 self.next_sequence = None;
259 return Ok(());
260 }
261 Some(data) => {
262 let mut sequence = data.sequence;
263 let mut quality = data.quality;
264
265 // If only working with mapped data, exclude soft-clipped
266 // regions from the sequence and quality. Java clips the 3' end first
267 // (so that 5' indices remain correct), then clips the 5' end.
268 if self.only_mapped && !data.cigar_ops.is_empty() {
269 // Clip 3' end first (last CIGAR element)
270 // Java checks `elements.get(elements.size()-1).getOperator().equals(CigarOperator.S)`
271 if let Some(&(kind, len)) = data.cigar_ops.last() {
272 if kind == CigarKind::SoftClip {
273 let new_len = sequence.len().saturating_sub(len);
274 sequence.truncate(new_len);
275 quality.truncate(new_len);
276 }
277 }
278
279 // Clip 5' end (first CIGAR element)
280 // Java checks `elements.get(0).getOperator().equals(CigarOperator.S)`
281 if let Some(&(kind, len)) = data.cigar_ops.first() {
282 if kind == CigarKind::SoftClip {
283 // Java uses `sequence.substring(value)` which
284 // drops the first `value` characters.
285 if len <= sequence.len() {
286 sequence = sequence[len..].to_vec();
287 quality = quality[len..].to_vec();
288 }
289 }
290 }
291 }
292
293 // BAM/SAM files always show sequence relative to the top
294 // strand of the mapped reference. If this read maps to the reverse strand,
295 // we reverse complement the sequence and reverse the qualities to recover
296 // the original read orientation. Java does this unconditionally when
297 // getReadNegativeStrandFlag() is true, regardless of only_mapped.
298 if data.is_reverse {
299 sequence = reverse_complement(&sequence);
300 quality.reverse();
301 }
302
303 let mut seq = Sequence::new(data.name, sequence, quality);
304
305 // Java's BAMFile.java does not explicitly handle CASAVA
306 // filtering via the read name (that is a FastQ convention). However,
307 // the SAM flag 0x200 (QC fail / vendor quality check failure) serves
308 // a similar purpose. We set is_filtered when the QC fail flag is set.
309 if data.is_qc_fail {
310 seq.is_filtered = true;
311 }
312
313 self.next_sequence = Some(seq);
314 return Ok(());
315 }
316 }
317 }
318 }
319}
320
321/// Intermediate struct holding extracted record fields, used to avoid
322/// borrow-checker issues with the reader and record.
323struct RecordData {
324 name: String,
325 sequence: Vec<u8>,
326 quality: Vec<u8>,
327 cigar_ops: Vec<(CigarKind, usize)>,
328 is_reverse: bool,
329 is_qc_fail: bool,
330}
331
332impl SequenceFile for BAMFile {
333 fn next(&mut self) -> Option<io::Result<Sequence>> {
334 // Java's `next()` returns the current `nextSequence` then calls
335 // `readNext()` to prime the next one. Same pattern as FastQFile.
336 let current = self.next_sequence.take()?;
337 if let Err(e) = self.read_next() {
338 return Some(Err(e));
339 }
340 Some(Ok(current))
341 }
342
343 fn name(&self) -> &str {
344 &self.name
345 }
346
347 /// BAM/SAM files are never colorspace.
348 /// Java's `BAMFile.isColorspace()` always returns false.
349 fn is_colorspace(&self) -> bool {
350 false
351 }
352
353 /// Java tracks progress using the raw FileInputStream position
354 /// divided by file size. For BAM files, this gives a rough estimate since
355 /// the underlying file is BGZF compressed. We estimate using the record size
356 /// heuristic from Java when precise position tracking is not available.
357 fn percent_complete(&self) -> f64 {
358 if self.next_sequence.is_none() {
359 return 100.0;
360 }
361 // Track progress via compressed byte position, same as FASTQ reader.
362 if let Some(ref handle) = self.position_handle {
363 if let Ok(mut h) = handle.try_clone() {
364 if let Ok(pos) = h.stream_position() {
365 return (pos as f64 / self.file_size as f64) * 100.0;
366 }
367 }
368 }
369 0.0
370 }
371}
372
373// ---------------------------------------------------------------------------
374// Format detection and factory function
375// ---------------------------------------------------------------------------
376
377/// Detect the file format from the config or file extension, and open the appropriate reader.
378///
379/// This mirrors the format detection logic in `SequenceFactory.java`:
380/// - If `config.sequence_format` is set, use that explicitly
381/// - Otherwise, detect by file extension
382///
383/// Returns a boxed `SequenceFile` trait object.
384pub fn open_sequence_file(
385 config: &crate::config::FastQCConfig,
386 path: &Path,
387) -> io::Result<Box<dyn SequenceFile>> {
388 use super::fastq::FastQFile;
389
390 // Java checks FastQCConfig.getInstance().getSequenceFormat() first.
391 // If set, it overrides extension-based detection.
392 if let Some(ref format) = config.sequence_format {
393 let format_lower = format.to_lowercase();
394 match format_lower.as_str() {
395 // "bam" format reads all records (mapped + unmapped)
396 "bam" => {
397 return Ok(Box::new(BAMFile::open(path, true, false)?));
398 }
399 // "sam" format reads all records
400 "sam" => {
401 return Ok(Box::new(BAMFile::open(path, false, false)?));
402 }
403 // "bam_mapped" reads only mapped records from BAM
404 "bam_mapped" => {
405 return Ok(Box::new(BAMFile::open(path, true, true)?));
406 }
407 // "sam_mapped" reads only mapped records from SAM
408 "sam_mapped" => {
409 return Ok(Box::new(BAMFile::open(path, false, true)?));
410 }
411 // "fastq" or "fastq.gz" or similar -> FastQFile
412 "fastq" => {
413 return Ok(Box::new(FastQFile::open(config, path)?));
414 }
415 other => {
416 return Err(io::Error::new(
417 io::ErrorKind::InvalidInput,
418 format!("Unknown sequence format: {}", other),
419 ));
420 }
421 }
422 }
423
424 // Extension-based detection from SequenceFactory.java
425 let name_lower = path
426 .file_name()
427 .map(|n| n.to_string_lossy().to_lowercase())
428 .unwrap_or_default();
429
430 if name_lower.ends_with(".bam") || name_lower.ends_with(".ubam") {
431 // .bam and .ubam files are opened as BAM with all reads
432 Ok(Box::new(BAMFile::open(path, true, false)?))
433 } else if name_lower.ends_with(".sam") {
434 // .sam files are opened as SAM with all reads
435 Ok(Box::new(BAMFile::open(path, false, false)?))
436 } else if name_lower.ends_with(".fast5") {
437 // Open Fast5 (Nanopore HDF5) files via pure-Rust hdf5-pure crate
438 Ok(Box::new(super::fast5::Fast5File::open(path)?))
439 } else {
440 // Everything else is assumed to be FASTQ
441 Ok(Box::new(FastQFile::open(config, path)?))
442 }
443}
444
445// ---------------------------------------------------------------------------
446// Tests
447// ---------------------------------------------------------------------------
448
449#[cfg(test)]
450mod tests {
451 use super::*;
452
453 #[test]
454 fn test_format_detection_fast5() {
455 // Fast5 files are now supported but still fail if the file doesn't exist
456 let config = crate::config::FastQCConfig::default();
457 let result = open_sequence_file(&config, Path::new("nonexistent.fast5"));
458 assert!(result.is_err());
459 }
460
461 #[test]
462 fn test_format_detection_unknown_format() {
463 let config = crate::config::FastQCConfig {
464 sequence_format: Some("unknown_format".to_string()),
465 ..Default::default()
466 };
467 let result = open_sequence_file(&config, Path::new("test.dat"));
468 assert!(result.is_err());
469 }
470}