fastqc_rust/sequence/fastq.rs
1// FASTQ file reader
2// Corresponds to Sequence/FastQFile.java
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader, Read, Seek, Stdin};
6use std::path::Path;
7
8use bzip2_rs::DecoderReader;
9use flate2::read::MultiGzDecoder;
10
11use super::{Sequence, SequenceFile};
12use crate::config::FastQCConfig;
13
14// ---------------------------------------------------------------------------
15// Decompression layer
16// ---------------------------------------------------------------------------
17
18/// Wrapper enum so we can store different reader types without trait objects.
19/// Each variant wraps a `BufReader` around the appropriate decompression stream.
20enum ReaderKind {
21 Plain(BufReader<File>),
22 Gzip(BufReader<MultiGzDecoder<File>>),
23 Bzip2(Box<BufReader<DecoderReader<File>>>),
24 Stdin(BufReader<Stdin>),
25}
26
27impl BufRead for ReaderKind {
28 fn fill_buf(&mut self) -> io::Result<&[u8]> {
29 match self {
30 ReaderKind::Plain(r) => r.fill_buf(),
31 ReaderKind::Gzip(r) => r.fill_buf(),
32 ReaderKind::Bzip2(r) => r.fill_buf(),
33 ReaderKind::Stdin(r) => r.fill_buf(),
34 }
35 }
36
37 fn consume(&mut self, amt: usize) {
38 match self {
39 ReaderKind::Plain(r) => r.consume(amt),
40 ReaderKind::Gzip(r) => r.consume(amt),
41 ReaderKind::Bzip2(r) => r.consume(amt),
42 ReaderKind::Stdin(r) => r.consume(amt),
43 }
44 }
45}
46
47impl Read for ReaderKind {
48 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
49 match self {
50 ReaderKind::Plain(r) => r.read(buf),
51 ReaderKind::Gzip(r) => r.read(buf),
52 ReaderKind::Bzip2(r) => r.read(buf),
53 ReaderKind::Stdin(r) => r.read(buf),
54 }
55 }
56}
57
58// ---------------------------------------------------------------------------
59// Compression detection
60// ---------------------------------------------------------------------------
61
62/// Detect compression from the first two bytes (magic numbers).
63/// Returns "gz", "bz2", or "none".
64fn detect_compression_from_magic(path: &Path) -> io::Result<&'static str> {
65 let mut f = File::open(path)?;
66 let mut magic = [0u8; 2];
67 let n = f.read(&mut magic)?;
68 if n >= 2 {
69 // Java detects gzip via file extension or MIME type probing which
70 // checks magic bytes 1f 8b internally. We replicate by checking magic directly.
71 if magic[0] == 0x1f && magic[1] == 0x8b {
72 return Ok("gz");
73 }
74 // Java only checks .bz2 extension, but we also check magic bytes
75 // 42 5a ('BZ') for robustness.
76 if magic[0] == 0x42 && magic[1] == 0x5a {
77 return Ok("bz2");
78 }
79 }
80 Ok("none")
81}
82
83// ---------------------------------------------------------------------------
84// FastQFile
85// ---------------------------------------------------------------------------
86
87/// FASTQ file reader that supports plain text, gzip, and bzip2 compressed files.
88///
89/// Mirrors `Sequence.FastQFile` in Java. Uses a look-ahead design
90/// where `readNext()` is called at construction time and after each `next()` call,
91/// so `hasNext()` can report whether more sequences are available. In Rust we use
92/// `Option<Sequence>` stored in `next_sequence` for the same pattern.
93pub struct FastQFile {
94 reader: ReaderKind,
95 name: String,
96 file_size: u64,
97 /// Cloned file handle used solely to query the compressed byte position
98 /// via seek(Current). Java does this with fis.getChannel().position().
99 /// None for stdin (no file to track).
100 position_handle: Option<File>,
101
102 /// The next sequence ready to be returned (look-ahead buffer).
103 next_sequence: Option<Sequence>,
104
105 /// Current line number for error messages, incremented on every
106 /// `readLine()` call exactly as in Java.
107 line_number: u64,
108
109 /// Whether colorspace was detected (checked on the first sequence only).
110 is_colorspace: bool,
111 /// Whether we have already checked for colorspace (first record only).
112 colorspace_checked: bool,
113
114 /// CASAVA filter mode flags.
115 casava_mode: bool,
116 nofilter: bool,
117
118 /// The lowest raw quality character seen so far (for Phred encoding detection).
119 pub lowest_char: u8,
120
121 /// A reusable String buffer to avoid allocating on every `read_line`.
122 line_buf: String,
123}
124
125impl FastQFile {
126 /// Open a FASTQ file for reading.
127 ///
128 /// The Java constructor opens the file, wraps it in the
129 /// appropriate decompression stream, and immediately calls `readNext()` to
130 /// prime the look-ahead buffer.
131 pub fn open<P: AsRef<Path>>(config: &FastQCConfig, path: P) -> io::Result<Self> {
132 let path = path.as_ref();
133 let name = path
134 .file_name()
135 .map(|n| n.to_string_lossy().into_owned())
136 .unwrap_or_else(|| path.to_string_lossy().into_owned());
137
138 let is_stdin = name.starts_with("stdin");
139
140 // For stdin, Java sets fileSize to Long.MAX_VALUE.
141 let file_size = if is_stdin {
142 u64::MAX
143 } else {
144 std::fs::metadata(path)?.len()
145 };
146
147 // Java keeps the raw FileInputStream (fis) and queries
148 // fis.getChannel().position() for progress tracking. We clone the File
149 // handle before wrapping it in decompression so we can seek on the clone
150 // to get the compressed byte position.
151 let (reader, position_handle) = if is_stdin {
152 (ReaderKind::Stdin(BufReader::new(io::stdin())), None)
153 } else {
154 let lower_name = name.to_lowercase();
155 let compression = if lower_name.ends_with(".gz") {
156 "gz"
157 } else if lower_name.ends_with(".bz2") {
158 "bz2"
159 } else {
160 detect_compression_from_magic(path)?
161 };
162
163 let file = File::open(path)?;
164 let pos_handle = file.try_clone()?;
165
166 let rdr = match compression {
167 "gz" => ReaderKind::Gzip(BufReader::new(MultiGzDecoder::new(file))),
168 "bz2" => ReaderKind::Bzip2(Box::new(BufReader::new(DecoderReader::new(file)))),
169 _ => ReaderKind::Plain(BufReader::new(file)),
170 };
171 (rdr, Some(pos_handle))
172 };
173
174 let casava_mode = config.casava;
175 let nofilter = config.nofilter;
176
177 let mut fq = FastQFile {
178 reader,
179 name,
180 file_size,
181 position_handle,
182 next_sequence: None,
183 line_number: 0,
184 is_colorspace: false,
185 colorspace_checked: false,
186 casava_mode,
187 nofilter,
188 lowest_char: 255,
189 line_buf: String::with_capacity(512),
190 };
191
192 // Prime the look-ahead buffer by reading the first record.
193 fq.read_next()?;
194
195 Ok(fq)
196 }
197
198 /// Read a single line into `self.line_buf`, incrementing `line_number`.
199 /// Returns `true` if a line was read, `false` at EOF.
200 fn read_line(&mut self) -> io::Result<bool> {
201 self.line_buf.clear();
202 let n = self.reader.read_line(&mut self.line_buf)?;
203 self.line_number += 1;
204 if n == 0 {
205 return Ok(false);
206 }
207 // Strip trailing newline / carriage return
208 while self.line_buf.ends_with('\n') || self.line_buf.ends_with('\r') {
209 self.line_buf.pop();
210 }
211 Ok(true)
212 }
213
214 /// Read the next FASTQ record into `self.next_sequence`.
215 ///
216 /// This mirrors `readNext()` in the Java code, including:
217 /// - Skipping blank lines between records
218 /// - Validating the '@' prefix on the ID line
219 /// - Validating the '+' prefix on the mid-line
220 /// - Colorspace detection on the first record only
221 /// - CASAVA filter detection via `:Y:` in the read ID
222 fn read_next(&mut self) -> io::Result<()> {
223 // -- ID line (skip blank lines) --
224 // The Java code loops reading lines until it finds a non-empty
225 // one or hits EOF. Blank lines between records are silently skipped.
226 loop {
227 if !self.read_line()? {
228 // EOF
229 self.next_sequence = None;
230 return Ok(());
231 }
232 if !self.line_buf.is_empty() {
233 break;
234 }
235 }
236
237 if !self.line_buf.starts_with('@') {
238 return Err(io::Error::new(
239 io::ErrorKind::InvalidData,
240 format!("ID line didn't start with '@' at line {}", self.line_number),
241 ));
242 }
243 // Clone the ID string and clear line_buf, preserving its heap allocation
244 // for reuse on subsequent read_line calls. Using std::mem::take here would
245 // leave line_buf with zero capacity, forcing a new allocation every line --
246 // 3 wasted allocations per record across millions of reads.
247 let id = self.line_buf.clone();
248 self.line_buf.clear();
249
250 // -- Sequence line --
251 if !self.read_line()? {
252 return Err(io::Error::new(
253 io::ErrorKind::UnexpectedEof,
254 "Ran out of data in the middle of a fastq entry. Your file is probably truncated",
255 ));
256 }
257 let seq_bytes = self.line_buf.as_bytes().to_vec();
258 self.line_buf.clear();
259
260 // -- Mid-line ('+' line) --
261 if !self.read_line()? {
262 return Err(io::Error::new(
263 io::ErrorKind::UnexpectedEof,
264 "Ran out of data in the middle of a fastq entry. Your file is probably truncated",
265 ));
266 }
267 if !self.line_buf.starts_with('+') {
268 return Err(io::Error::new(
269 io::ErrorKind::InvalidData,
270 format!(
271 "Midline '{}' didn't start with '+' at {}",
272 self.line_buf, self.line_number
273 ),
274 ));
275 }
276 // Mid-line is not needed; just clear the buffer (keeping its allocation)
277 self.line_buf.clear();
278
279 // -- Quality line --
280 if !self.read_line()? {
281 return Err(io::Error::new(
282 io::ErrorKind::UnexpectedEof,
283 "Ran out of data in the middle of a fastq entry. Your file is probably truncated",
284 ));
285 }
286 let quality_bytes = self.line_buf.as_bytes().to_vec();
287 self.line_buf.clear();
288
289 // Track lowest quality character for Phred encoding detection.
290 for &b in &quality_bytes {
291 if b < self.lowest_char {
292 self.lowest_char = b;
293 }
294 }
295
296 // -- Colorspace detection (first record only) --
297 // Java checks only the very first sequence for colorspace and
298 // then assumes the rest of the file is the same. The check is that
299 // `nextSequence` is null (i.e. no prior record) and `seq` is non-null.
300 if !self.colorspace_checked {
301 self.colorspace_checked = true;
302 // Safety: seq_bytes originated from a valid UTF-8 String
303 let seq_str = std::str::from_utf8(&seq_bytes).unwrap_or("");
304 self.is_colorspace = check_colorspace(seq_str);
305 }
306
307 // -- CASAVA filtering --
308 // If running in --casava mode without --nofilter, check the
309 // ID for `:Y:` anywhere after position 0 and flag the sequence as filtered.
310 let is_filtered =
311 self.casava_mode && !self.nofilter && id.find(":Y:").is_some_and(|pos| pos > 0);
312
313 // Build the Sequence
314 let mut sequence = if self.is_colorspace {
315 // For colorspace, `seq.toUpperCase()` is passed to both
316 // `convertColorspaceToBases` and stored as `colorspaceSequence`.
317 // Safety: seq_bytes originated from a valid UTF-8 String
318 let seq_str = String::from_utf8(seq_bytes).unwrap_or_default();
319 let upper = seq_str.to_ascii_uppercase();
320 let bases = convert_colorspace_to_bases(&upper);
321 let mut s = Sequence::new(id, bases.into_bytes(), quality_bytes);
322 s.colorspace = Some(upper.into_bytes());
323 s
324 } else {
325 // Normal path - Java calls `new Sequence(this, seq.toUpperCase(), quality, id)`.
326 // The `Sequence::new` constructor already uppercases, matching Java.
327 Sequence::new(id, seq_bytes, quality_bytes)
328 };
329
330 sequence.is_filtered = is_filtered;
331 self.next_sequence = Some(sequence);
332
333 Ok(())
334 }
335}
336
337impl SequenceFile for FastQFile {
338 fn next(&mut self) -> Option<io::Result<Sequence>> {
339 // Java's `next()` returns the current `nextSequence` then calls
340 // `readNext()` to prime the next one. We do the same.
341 let current = self.next_sequence.take()?;
342 if let Err(e) = self.read_next() {
343 // Store nothing for next time; the error is returned on the *next* call
344 // would be confusing. Instead, return the error now and let the current
345 // sequence be lost (matching Java which throws from next()).
346 // Actually, Java's next() calls readNext() but returns the previous value.
347 // If readNext() throws, the exception propagates out of next().
348 // We replicate: return the error, dropping `current`.
349 return Some(Err(e));
350 }
351 Some(Ok(current))
352 }
353
354 fn name(&self) -> &str {
355 &self.name
356 }
357
358 fn is_colorspace(&self) -> bool {
359 self.is_colorspace
360 }
361
362 /// Java reads `fis.getChannel().position()` which gives the
363 /// *compressed* byte position, then divides by file size (also compressed).
364 /// For plain files this is exact; for compressed files it gives a rough
365 /// estimate based on compressed bytes consumed.
366 ///
367 /// For stdin, Java always returns 0 until EOF then 100. We replicate that.
368 fn percent_complete(&self) -> f64 {
369 if self.next_sequence.is_none() {
370 return 100.0;
371 }
372 if self.name.starts_with("stdin") {
373 return 0.0;
374 }
375 // Java queries fis.getChannel().position() on the raw FileInputStream
376 // to get the compressed byte position, then divides by fileSize.
377 // We do the same via a cloned file handle using seek(Current).
378 if let Some(ref handle) = self.position_handle {
379 // try_clone to get a mutable handle without requiring &mut self
380 if let Ok(mut h) = handle.try_clone() {
381 if let Ok(pos) = h.stream_position() {
382 return (pos as f64 / self.file_size as f64) * 100.0;
383 }
384 }
385 }
386 0.0
387 }
388}
389
390// ---------------------------------------------------------------------------
391// Colorspace helpers
392// ---------------------------------------------------------------------------
393
394/// Check whether a sequence string is colorspace (SOLiD) format.
395///
396/// Uses the exact same regex `^[GATCNgatcn][\.0123456]+$` as Java.
397/// We implement it manually instead of pulling in a regex crate.
398fn check_colorspace(seq: &str) -> bool {
399 let bytes = seq.as_bytes();
400 if bytes.len() < 2 {
401 return false;
402 }
403 // First character must be a DNA base
404 if !matches!(
405 bytes[0],
406 b'G' | b'A' | b'T' | b'C' | b'N' | b'g' | b'a' | b't' | b'c' | b'n'
407 ) {
408 return false;
409 }
410 // Remaining characters must be '.', '0'-'6'
411 for &b in &bytes[1..] {
412 if !matches!(b, b'.' | b'0'..=b'6') {
413 return false;
414 }
415 }
416 true
417}
418
419/// Convert a colorspace sequence to base-space.
420///
421/// This is a direct translation of `convertColorspaceToBases()` from
422/// FastQFile.java, preserving the exact same lookup table and the behavior where
423/// encountering '.', '4', '5', or '6' causes all remaining positions to become 'N'.
424fn convert_colorspace_to_bases(s: &str) -> String {
425 let cs: Vec<u8> = s.as_bytes().to_vec();
426
427 // Java returns "" for zero-length input.
428 if cs.is_empty() {
429 return String::new();
430 }
431
432 // Output is one shorter than input (the leading reference base is consumed).
433 let mut bp = vec![0u8; cs.len() - 1];
434
435 for i in 1..cs.len() {
436 let ref_base = if i == 1 {
437 // First iteration uses cs[0] (the leading reference base).
438 cs[0]
439 } else {
440 // Subsequent iterations use the *previous output* base.
441 bp[i - 2]
442 };
443
444 // If refBase is not a valid DNA letter, Java throws
445 // IllegalArgumentException. We replicate with a panic for now, but
446 // callers should ensure valid input.
447 debug_assert!(
448 matches!(ref_base, b'G' | b'A' | b'T' | b'C'),
449 "Colorspace sequence data should always start with a real DNA letter, got '{}'",
450 ref_base as char,
451 );
452
453 // The colorspace-to-base lookup table. Each color digit
454 // encodes a transition from the reference base:
455 // 0 = same base, 1 = transversion1, 2 = transition, 3 = transversion2
456 // '.', '4', '5', '6' = unknown -> fill rest with N
457 bp[i - 1] = match cs[i] {
458 b'0' => ref_base, // same base
459 b'1' => match ref_base {
460 b'A' => b'C',
461 b'C' => b'A',
462 b'G' => b'T',
463 b'T' => b'G',
464 _ => b'N',
465 },
466 b'2' => match ref_base {
467 b'A' => b'G',
468 b'G' => b'A',
469 b'C' => b'T',
470 b'T' => b'C',
471 _ => b'N',
472 },
473 b'3' => match ref_base {
474 b'A' => b'T',
475 b'T' => b'A',
476 b'G' => b'C',
477 b'C' => b'G',
478 _ => b'N',
479 },
480 // '.', '4', '5', '6' cause all *remaining* positions
481 // (including the current one) to be set to 'N'. Java does this with
482 // a for-loop from the current `i` to end.
483 b'.' | b'4' | b'5' | b'6' => {
484 for b in &mut bp[(i - 1)..] {
485 *b = b'N';
486 }
487 break;
488 }
489 other => {
490 // Java throws IllegalArgumentException for unexpected chars.
491 panic!("Unexpected colorspace char '{}'", other as char);
492 }
493 };
494 }
495
496 // Safety: bp contains only ASCII DNA letters or 'N'
497 String::from_utf8(bp).expect("colorspace output should be valid UTF-8")
498}
499
500// ---------------------------------------------------------------------------
501// Tests
502// ---------------------------------------------------------------------------
503
504#[cfg(test)]
505mod tests {
506 use super::*;
507
508 // ---- Colorspace helpers ----
509
510 #[test]
511 fn test_check_colorspace_positive() {
512 assert!(check_colorspace("G0123456"));
513 assert!(check_colorspace("A.012"));
514 assert!(check_colorspace("t00"));
515 }
516
517 #[test]
518 fn test_check_colorspace_negative() {
519 assert!(!check_colorspace("ACGTACGT"));
520 assert!(!check_colorspace("A")); // too short
521 assert!(!check_colorspace(""));
522 assert!(!check_colorspace("X012")); // invalid lead
523 }
524
525 #[test]
526 fn test_convert_colorspace_basic() {
527 // A0 -> same as A = A
528 assert_eq!(convert_colorspace_to_bases("A0"), "A");
529 // A1 -> A->C
530 assert_eq!(convert_colorspace_to_bases("A1"), "C");
531 // A2 -> A->G
532 assert_eq!(convert_colorspace_to_bases("A2"), "G");
533 // A3 -> A->T
534 assert_eq!(convert_colorspace_to_bases("A3"), "T");
535 }
536
537 #[test]
538 fn test_convert_colorspace_chained() {
539 // A00 -> A,A (ref=A->A, then ref=A->A)
540 assert_eq!(convert_colorspace_to_bases("A00"), "AA");
541 // A01 -> A, C (ref=A->A, then ref=A->C)
542 assert_eq!(convert_colorspace_to_bases("A01"), "AC");
543 // G10 -> T, T (ref=G->T, then ref=T->T)
544 assert_eq!(convert_colorspace_to_bases("G10"), "TT");
545 }
546
547 #[test]
548 fn test_convert_colorspace_unknown_fills_n() {
549 // '.' causes rest to be N
550 assert_eq!(convert_colorspace_to_bases("A.12"), "NNN");
551 // '4' also fills rest with N
552 assert_eq!(convert_colorspace_to_bases("A04"), "AN");
553 }
554
555 #[test]
556 fn test_convert_colorspace_empty() {
557 assert_eq!(convert_colorspace_to_bases(""), "");
558 }
559
560 // ---- FastQFile reading ----
561
562 #[test]
563 fn test_read_minimal_fastq() {
564 let config = FastQCConfig::default();
565 let path = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/minimal.fastq");
566 let mut reader = FastQFile::open(&config, path).unwrap();
567
568 // Should have exactly one record
569 let seq = reader.next().unwrap().unwrap();
570 assert_eq!(seq.id, "@READ0001");
571 assert_eq!(seq.sequence, b"AAAAAAAAAAAAAAAA");
572 assert_eq!(seq.quality, b"IIIIIIIIIIIIIIII");
573 assert!(!seq.is_filtered);
574 assert!(!reader.is_colorspace());
575
576 // No more records
577 assert!(reader.next().is_none());
578 }
579
580 #[test]
581 fn test_read_complex_fastq() {
582 let config = FastQCConfig::default();
583 let path = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/complex.fastq");
584 let mut reader = FastQFile::open(&config, path).unwrap();
585
586 let mut count = 0;
587 while let Some(result) = reader.next() {
588 let seq = result.unwrap();
589 count += 1;
590 // All reads in complex.fastq have the same sequence and quality
591 assert_eq!(seq.sequence, b"ACGTACGTACGTACGT");
592 assert_eq!(seq.quality, b"IIIIIIIIIIIIIIII");
593 // IDs are @READ0001 through @READ0005
594 assert_eq!(seq.id, format!("@READ{:04}", count));
595 }
596 assert_eq!(count, 5);
597 }
598
599 #[test]
600 fn test_lowest_char_tracking() {
601 let config = FastQCConfig::default();
602 let path = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/minimal.fastq");
603 let mut reader = FastQFile::open(&config, path).unwrap();
604
605 // Consume all records
606 while reader.next().is_some() {}
607
608 // 'I' is ASCII 73
609 assert_eq!(reader.lowest_char, b'I');
610 }
611
612 #[test]
613 fn test_casava_filter_detection() {
614 // We can't easily create a temp file in a unit test without extra deps,
615 // so we test the CASAVA logic by constructing a reader over a known file.
616 // The test files don't have :Y: in the ID, so nothing should be filtered.
617 let config = FastQCConfig {
618 casava: true,
619 nofilter: false,
620 ..FastQCConfig::default()
621 };
622 let path = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/minimal.fastq");
623 let mut reader = FastQFile::open(&config, path).unwrap();
624 let seq = reader.next().unwrap().unwrap();
625 // "@READ0001" has no ":Y:", so not filtered
626 assert!(!seq.is_filtered);
627 }
628
629 #[test]
630 fn test_sequence_uppercase() {
631 // Java uppercases the sequence. Our Sequence::new does the same.
632 let seq = Sequence::new(
633 "@test".to_string(),
634 b"acgtACGT".to_vec(),
635 b"IIIIIIII".to_vec(),
636 );
637 assert_eq!(seq.sequence, b"ACGTACGT");
638 }
639}