Skip to main content

fastqc_rust/sequence/
fastq.rs

1// FASTQ file reader
2// Corresponds to Sequence/FastQFile.java
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader, Read, Seek, Stdin};
6use std::path::Path;
7
8use bzip2_rs::DecoderReader;
9use flate2::read::MultiGzDecoder;
10
11use super::{Sequence, SequenceFile};
12use crate::config::FastQCConfig;
13
14// ---------------------------------------------------------------------------
15// Decompression layer
16// ---------------------------------------------------------------------------
17
18/// Wrapper enum so we can store different reader types without trait objects.
19/// Each variant wraps a `BufReader` around the appropriate decompression stream.
20enum ReaderKind {
21    Plain(BufReader<File>),
22    Gzip(BufReader<MultiGzDecoder<File>>),
23    Bzip2(Box<BufReader<DecoderReader<File>>>),
24    Stdin(BufReader<Stdin>),
25}
26
27impl BufRead for ReaderKind {
28    fn fill_buf(&mut self) -> io::Result<&[u8]> {
29        match self {
30            ReaderKind::Plain(r) => r.fill_buf(),
31            ReaderKind::Gzip(r) => r.fill_buf(),
32            ReaderKind::Bzip2(r) => r.fill_buf(),
33            ReaderKind::Stdin(r) => r.fill_buf(),
34        }
35    }
36
37    fn consume(&mut self, amt: usize) {
38        match self {
39            ReaderKind::Plain(r) => r.consume(amt),
40            ReaderKind::Gzip(r) => r.consume(amt),
41            ReaderKind::Bzip2(r) => r.consume(amt),
42            ReaderKind::Stdin(r) => r.consume(amt),
43        }
44    }
45}
46
47impl Read for ReaderKind {
48    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
49        match self {
50            ReaderKind::Plain(r) => r.read(buf),
51            ReaderKind::Gzip(r) => r.read(buf),
52            ReaderKind::Bzip2(r) => r.read(buf),
53            ReaderKind::Stdin(r) => r.read(buf),
54        }
55    }
56}
57
58// ---------------------------------------------------------------------------
59// Compression detection
60// ---------------------------------------------------------------------------
61
62/// Detect compression from the first two bytes (magic numbers).
63/// Returns "gz", "bz2", or "none".
64fn detect_compression_from_magic(path: &Path) -> io::Result<&'static str> {
65    let mut f = File::open(path)?;
66    let mut magic = [0u8; 2];
67    let n = f.read(&mut magic)?;
68    if n >= 2 {
69        // Java detects gzip via file extension or MIME type probing which
70        // checks magic bytes 1f 8b internally. We replicate by checking magic directly.
71        if magic[0] == 0x1f && magic[1] == 0x8b {
72            return Ok("gz");
73        }
74        // Java only checks .bz2 extension, but we also check magic bytes
75        // 42 5a ('BZ') for robustness.
76        if magic[0] == 0x42 && magic[1] == 0x5a {
77            return Ok("bz2");
78        }
79    }
80    Ok("none")
81}
82
83// ---------------------------------------------------------------------------
84// FastQFile
85// ---------------------------------------------------------------------------
86
87/// FASTQ file reader that supports plain text, gzip, and bzip2 compressed files.
88///
89/// Mirrors `Sequence.FastQFile` in Java. Uses a look-ahead design
90/// where `readNext()` is called at construction time and after each `next()` call,
91/// so `hasNext()` can report whether more sequences are available. In Rust we use
92/// `Option<Sequence>` stored in `next_sequence` for the same pattern.
93pub struct FastQFile {
94    reader: ReaderKind,
95    name: String,
96    file_size: u64,
97    /// Cloned file handle used solely to query the compressed byte position
98    /// via seek(Current). Java does this with fis.getChannel().position().
99    /// None for stdin (no file to track).
100    position_handle: Option<File>,
101
102    /// The next sequence ready to be returned (look-ahead buffer).
103    next_sequence: Option<Sequence>,
104
105    /// Current line number for error messages, incremented on every
106    /// `readLine()` call exactly as in Java.
107    line_number: u64,
108
109    /// Whether colorspace was detected (checked on the first sequence only).
110    is_colorspace: bool,
111    /// Whether we have already checked for colorspace (first record only).
112    colorspace_checked: bool,
113
114    /// CASAVA filter mode flags.
115    casava_mode: bool,
116    nofilter: bool,
117
118    /// The lowest raw quality character seen so far (for Phred encoding detection).
119    pub lowest_char: u8,
120
121    /// A reusable String buffer to avoid allocating on every `read_line`.
122    line_buf: String,
123}
124
125impl FastQFile {
126    /// Open a FASTQ file for reading.
127    ///
128    /// The Java constructor opens the file, wraps it in the
129    /// appropriate decompression stream, and immediately calls `readNext()` to
130    /// prime the look-ahead buffer.
131    pub fn open<P: AsRef<Path>>(config: &FastQCConfig, path: P) -> io::Result<Self> {
132        let path = path.as_ref();
133        let name = path
134            .file_name()
135            .map(|n| n.to_string_lossy().into_owned())
136            .unwrap_or_else(|| path.to_string_lossy().into_owned());
137
138        let is_stdin = name.starts_with("stdin");
139
140        // For stdin, Java sets fileSize to Long.MAX_VALUE.
141        let file_size = if is_stdin {
142            u64::MAX
143        } else {
144            std::fs::metadata(path)?.len()
145        };
146
147        // Java keeps the raw FileInputStream (fis) and queries
148        // fis.getChannel().position() for progress tracking. We clone the File
149        // handle before wrapping it in decompression so we can seek on the clone
150        // to get the compressed byte position.
151        let (reader, position_handle) = if is_stdin {
152            (ReaderKind::Stdin(BufReader::new(io::stdin())), None)
153        } else {
154            let lower_name = name.to_lowercase();
155            let compression = if lower_name.ends_with(".gz") {
156                "gz"
157            } else if lower_name.ends_with(".bz2") {
158                "bz2"
159            } else {
160                detect_compression_from_magic(path)?
161            };
162
163            let file = File::open(path)?;
164            let pos_handle = file.try_clone()?;
165
166            let rdr = match compression {
167                "gz" => ReaderKind::Gzip(BufReader::new(MultiGzDecoder::new(file))),
168                "bz2" => ReaderKind::Bzip2(Box::new(BufReader::new(DecoderReader::new(file)))),
169                _ => ReaderKind::Plain(BufReader::new(file)),
170            };
171            (rdr, Some(pos_handle))
172        };
173
174        let casava_mode = config.casava;
175        let nofilter = config.nofilter;
176
177        let mut fq = FastQFile {
178            reader,
179            name,
180            file_size,
181            position_handle,
182            next_sequence: None,
183            line_number: 0,
184            is_colorspace: false,
185            colorspace_checked: false,
186            casava_mode,
187            nofilter,
188            lowest_char: 255,
189            line_buf: String::with_capacity(512),
190        };
191
192        // Prime the look-ahead buffer by reading the first record.
193        fq.read_next()?;
194
195        Ok(fq)
196    }
197
198    /// Read a single line into `self.line_buf`, incrementing `line_number`.
199    /// Returns `true` if a line was read, `false` at EOF.
200    fn read_line(&mut self) -> io::Result<bool> {
201        self.line_buf.clear();
202        let n = self.reader.read_line(&mut self.line_buf)?;
203        self.line_number += 1;
204        if n == 0 {
205            return Ok(false);
206        }
207        // Strip trailing newline / carriage return
208        while self.line_buf.ends_with('\n') || self.line_buf.ends_with('\r') {
209            self.line_buf.pop();
210        }
211        Ok(true)
212    }
213
214    /// Read the next FASTQ record into `self.next_sequence`.
215    ///
216    /// This mirrors `readNext()` in the Java code, including:
217    /// - Skipping blank lines between records
218    /// - Validating the '@' prefix on the ID line
219    /// - Validating the '+' prefix on the mid-line
220    /// - Colorspace detection on the first record only
221    /// - CASAVA filter detection via `:Y:` in the read ID
222    fn read_next(&mut self) -> io::Result<()> {
223        // -- ID line (skip blank lines) --
224        // The Java code loops reading lines until it finds a non-empty
225        // one or hits EOF. Blank lines between records are silently skipped.
226        loop {
227            if !self.read_line()? {
228                // EOF
229                self.next_sequence = None;
230                return Ok(());
231            }
232            if !self.line_buf.is_empty() {
233                break;
234            }
235        }
236
237        if !self.line_buf.starts_with('@') {
238            return Err(io::Error::new(
239                io::ErrorKind::InvalidData,
240                format!("ID line didn't start with '@' at line {}", self.line_number),
241            ));
242        }
243        // Clone the ID string and clear line_buf, preserving its heap allocation
244        // for reuse on subsequent read_line calls. Using std::mem::take here would
245        // leave line_buf with zero capacity, forcing a new allocation every line --
246        // 3 wasted allocations per record across millions of reads.
247        let id = self.line_buf.clone();
248        self.line_buf.clear();
249
250        // -- Sequence line --
251        if !self.read_line()? {
252            return Err(io::Error::new(
253                io::ErrorKind::UnexpectedEof,
254                "Ran out of data in the middle of a fastq entry. Your file is probably truncated",
255            ));
256        }
257        let seq_bytes = self.line_buf.as_bytes().to_vec();
258        self.line_buf.clear();
259
260        // -- Mid-line ('+' line) --
261        if !self.read_line()? {
262            return Err(io::Error::new(
263                io::ErrorKind::UnexpectedEof,
264                "Ran out of data in the middle of a fastq entry. Your file is probably truncated",
265            ));
266        }
267        if !self.line_buf.starts_with('+') {
268            return Err(io::Error::new(
269                io::ErrorKind::InvalidData,
270                format!(
271                    "Midline '{}' didn't start with '+' at {}",
272                    self.line_buf, self.line_number
273                ),
274            ));
275        }
276        // Mid-line is not needed; just clear the buffer (keeping its allocation)
277        self.line_buf.clear();
278
279        // -- Quality line --
280        if !self.read_line()? {
281            return Err(io::Error::new(
282                io::ErrorKind::UnexpectedEof,
283                "Ran out of data in the middle of a fastq entry. Your file is probably truncated",
284            ));
285        }
286        let quality_bytes = self.line_buf.as_bytes().to_vec();
287        self.line_buf.clear();
288
289        // Track lowest quality character for Phred encoding detection.
290        for &b in &quality_bytes {
291            if b < self.lowest_char {
292                self.lowest_char = b;
293            }
294        }
295
296        // -- Colorspace detection (first record only) --
297        // Java checks only the very first sequence for colorspace and
298        // then assumes the rest of the file is the same. The check is that
299        // `nextSequence` is null (i.e. no prior record) and `seq` is non-null.
300        if !self.colorspace_checked {
301            self.colorspace_checked = true;
302            // Safety: seq_bytes originated from a valid UTF-8 String
303            let seq_str = std::str::from_utf8(&seq_bytes).unwrap_or("");
304            self.is_colorspace = check_colorspace(seq_str);
305        }
306
307        // -- CASAVA filtering --
308        // If running in --casava mode without --nofilter, check the
309        // ID for `:Y:` anywhere after position 0 and flag the sequence as filtered.
310        let is_filtered =
311            self.casava_mode && !self.nofilter && id.find(":Y:").is_some_and(|pos| pos > 0);
312
313        // Build the Sequence
314        let mut sequence = if self.is_colorspace {
315            // For colorspace, `seq.toUpperCase()` is passed to both
316            // `convertColorspaceToBases` and stored as `colorspaceSequence`.
317            // Safety: seq_bytes originated from a valid UTF-8 String
318            let seq_str = String::from_utf8(seq_bytes).unwrap_or_default();
319            let upper = seq_str.to_ascii_uppercase();
320            let bases = convert_colorspace_to_bases(&upper);
321            let mut s = Sequence::new(id, bases.into_bytes(), quality_bytes);
322            s.colorspace = Some(upper.into_bytes());
323            s
324        } else {
325            // Normal path - Java calls `new Sequence(this, seq.toUpperCase(), quality, id)`.
326            // The `Sequence::new` constructor already uppercases, matching Java.
327            Sequence::new(id, seq_bytes, quality_bytes)
328        };
329
330        sequence.is_filtered = is_filtered;
331        self.next_sequence = Some(sequence);
332
333        Ok(())
334    }
335}
336
337impl SequenceFile for FastQFile {
338    fn next(&mut self) -> Option<io::Result<Sequence>> {
339        // Java's `next()` returns the current `nextSequence` then calls
340        // `readNext()` to prime the next one. We do the same.
341        let current = self.next_sequence.take()?;
342        if let Err(e) = self.read_next() {
343            // Store nothing for next time; the error is returned on the *next* call
344            // would be confusing. Instead, return the error now and let the current
345            // sequence be lost (matching Java which throws from next()).
346            // Actually, Java's next() calls readNext() but returns the previous value.
347            // If readNext() throws, the exception propagates out of next().
348            // We replicate: return the error, dropping `current`.
349            return Some(Err(e));
350        }
351        Some(Ok(current))
352    }
353
354    fn name(&self) -> &str {
355        &self.name
356    }
357
358    fn is_colorspace(&self) -> bool {
359        self.is_colorspace
360    }
361
362    /// Java reads `fis.getChannel().position()` which gives the
363    /// *compressed* byte position, then divides by file size (also compressed).
364    /// For plain files this is exact; for compressed files it gives a rough
365    /// estimate based on compressed bytes consumed.
366    ///
367    /// For stdin, Java always returns 0 until EOF then 100. We replicate that.
368    fn percent_complete(&self) -> f64 {
369        if self.next_sequence.is_none() {
370            return 100.0;
371        }
372        if self.name.starts_with("stdin") {
373            return 0.0;
374        }
375        // Java queries fis.getChannel().position() on the raw FileInputStream
376        // to get the compressed byte position, then divides by fileSize.
377        // We do the same via a cloned file handle using seek(Current).
378        if let Some(ref handle) = self.position_handle {
379            // try_clone to get a mutable handle without requiring &mut self
380            if let Ok(mut h) = handle.try_clone() {
381                if let Ok(pos) = h.stream_position() {
382                    return (pos as f64 / self.file_size as f64) * 100.0;
383                }
384            }
385        }
386        0.0
387    }
388}
389
390// ---------------------------------------------------------------------------
391// Colorspace helpers
392// ---------------------------------------------------------------------------
393
394/// Check whether a sequence string is colorspace (SOLiD) format.
395///
396/// Uses the exact same regex `^[GATCNgatcn][\.0123456]+$` as Java.
397/// We implement it manually instead of pulling in a regex crate.
398fn check_colorspace(seq: &str) -> bool {
399    let bytes = seq.as_bytes();
400    if bytes.len() < 2 {
401        return false;
402    }
403    // First character must be a DNA base
404    if !matches!(
405        bytes[0],
406        b'G' | b'A' | b'T' | b'C' | b'N' | b'g' | b'a' | b't' | b'c' | b'n'
407    ) {
408        return false;
409    }
410    // Remaining characters must be '.', '0'-'6'
411    for &b in &bytes[1..] {
412        if !matches!(b, b'.' | b'0'..=b'6') {
413            return false;
414        }
415    }
416    true
417}
418
419/// Convert a colorspace sequence to base-space.
420///
421/// This is a direct translation of `convertColorspaceToBases()` from
422/// FastQFile.java, preserving the exact same lookup table and the behavior where
423/// encountering '.', '4', '5', or '6' causes all remaining positions to become 'N'.
424fn convert_colorspace_to_bases(s: &str) -> String {
425    let cs: Vec<u8> = s.as_bytes().to_vec();
426
427    // Java returns "" for zero-length input.
428    if cs.is_empty() {
429        return String::new();
430    }
431
432    // Output is one shorter than input (the leading reference base is consumed).
433    let mut bp = vec![0u8; cs.len() - 1];
434
435    for i in 1..cs.len() {
436        let ref_base = if i == 1 {
437            // First iteration uses cs[0] (the leading reference base).
438            cs[0]
439        } else {
440            // Subsequent iterations use the *previous output* base.
441            bp[i - 2]
442        };
443
444        // If refBase is not a valid DNA letter, Java throws
445        // IllegalArgumentException. We replicate with a panic for now, but
446        // callers should ensure valid input.
447        debug_assert!(
448            matches!(ref_base, b'G' | b'A' | b'T' | b'C'),
449            "Colorspace sequence data should always start with a real DNA letter, got '{}'",
450            ref_base as char,
451        );
452
453        // The colorspace-to-base lookup table. Each color digit
454        // encodes a transition from the reference base:
455        //   0 = same base, 1 = transversion1, 2 = transition, 3 = transversion2
456        //   '.', '4', '5', '6' = unknown -> fill rest with N
457        bp[i - 1] = match cs[i] {
458            b'0' => ref_base, // same base
459            b'1' => match ref_base {
460                b'A' => b'C',
461                b'C' => b'A',
462                b'G' => b'T',
463                b'T' => b'G',
464                _ => b'N',
465            },
466            b'2' => match ref_base {
467                b'A' => b'G',
468                b'G' => b'A',
469                b'C' => b'T',
470                b'T' => b'C',
471                _ => b'N',
472            },
473            b'3' => match ref_base {
474                b'A' => b'T',
475                b'T' => b'A',
476                b'G' => b'C',
477                b'C' => b'G',
478                _ => b'N',
479            },
480            // '.', '4', '5', '6' cause all *remaining* positions
481            // (including the current one) to be set to 'N'. Java does this with
482            // a for-loop from the current `i` to end.
483            b'.' | b'4' | b'5' | b'6' => {
484                for b in &mut bp[(i - 1)..] {
485                    *b = b'N';
486                }
487                break;
488            }
489            other => {
490                // Java throws IllegalArgumentException for unexpected chars.
491                panic!("Unexpected colorspace char '{}'", other as char);
492            }
493        };
494    }
495
496    // Safety: bp contains only ASCII DNA letters or 'N'
497    String::from_utf8(bp).expect("colorspace output should be valid UTF-8")
498}
499
500// ---------------------------------------------------------------------------
501// Tests
502// ---------------------------------------------------------------------------
503
504#[cfg(test)]
505mod tests {
506    use super::*;
507
508    // ---- Colorspace helpers ----
509
510    #[test]
511    fn test_check_colorspace_positive() {
512        assert!(check_colorspace("G0123456"));
513        assert!(check_colorspace("A.012"));
514        assert!(check_colorspace("t00"));
515    }
516
517    #[test]
518    fn test_check_colorspace_negative() {
519        assert!(!check_colorspace("ACGTACGT"));
520        assert!(!check_colorspace("A")); // too short
521        assert!(!check_colorspace(""));
522        assert!(!check_colorspace("X012")); // invalid lead
523    }
524
525    #[test]
526    fn test_convert_colorspace_basic() {
527        // A0 -> same as A = A
528        assert_eq!(convert_colorspace_to_bases("A0"), "A");
529        // A1 -> A->C
530        assert_eq!(convert_colorspace_to_bases("A1"), "C");
531        // A2 -> A->G
532        assert_eq!(convert_colorspace_to_bases("A2"), "G");
533        // A3 -> A->T
534        assert_eq!(convert_colorspace_to_bases("A3"), "T");
535    }
536
537    #[test]
538    fn test_convert_colorspace_chained() {
539        // A00 -> A,A (ref=A->A, then ref=A->A)
540        assert_eq!(convert_colorspace_to_bases("A00"), "AA");
541        // A01 -> A, C (ref=A->A, then ref=A->C)
542        assert_eq!(convert_colorspace_to_bases("A01"), "AC");
543        // G10 -> T, T (ref=G->T, then ref=T->T)
544        assert_eq!(convert_colorspace_to_bases("G10"), "TT");
545    }
546
547    #[test]
548    fn test_convert_colorspace_unknown_fills_n() {
549        // '.' causes rest to be N
550        assert_eq!(convert_colorspace_to_bases("A.12"), "NNN");
551        // '4' also fills rest with N
552        assert_eq!(convert_colorspace_to_bases("A04"), "AN");
553    }
554
555    #[test]
556    fn test_convert_colorspace_empty() {
557        assert_eq!(convert_colorspace_to_bases(""), "");
558    }
559
560    // ---- FastQFile reading ----
561
562    #[test]
563    fn test_read_minimal_fastq() {
564        let config = FastQCConfig::default();
565        let path = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/minimal.fastq");
566        let mut reader = FastQFile::open(&config, path).unwrap();
567
568        // Should have exactly one record
569        let seq = reader.next().unwrap().unwrap();
570        assert_eq!(seq.id, "@READ0001");
571        assert_eq!(seq.sequence, b"AAAAAAAAAAAAAAAA");
572        assert_eq!(seq.quality, b"IIIIIIIIIIIIIIII");
573        assert!(!seq.is_filtered);
574        assert!(!reader.is_colorspace());
575
576        // No more records
577        assert!(reader.next().is_none());
578    }
579
580    #[test]
581    fn test_read_complex_fastq() {
582        let config = FastQCConfig::default();
583        let path = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/complex.fastq");
584        let mut reader = FastQFile::open(&config, path).unwrap();
585
586        let mut count = 0;
587        while let Some(result) = reader.next() {
588            let seq = result.unwrap();
589            count += 1;
590            // All reads in complex.fastq have the same sequence and quality
591            assert_eq!(seq.sequence, b"ACGTACGTACGTACGT");
592            assert_eq!(seq.quality, b"IIIIIIIIIIIIIIII");
593            // IDs are @READ0001 through @READ0005
594            assert_eq!(seq.id, format!("@READ{:04}", count));
595        }
596        assert_eq!(count, 5);
597    }
598
599    #[test]
600    fn test_lowest_char_tracking() {
601        let config = FastQCConfig::default();
602        let path = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/minimal.fastq");
603        let mut reader = FastQFile::open(&config, path).unwrap();
604
605        // Consume all records
606        while reader.next().is_some() {}
607
608        // 'I' is ASCII 73
609        assert_eq!(reader.lowest_char, b'I');
610    }
611
612    #[test]
613    fn test_casava_filter_detection() {
614        // We can't easily create a temp file in a unit test without extra deps,
615        // so we test the CASAVA logic by constructing a reader over a known file.
616        // The test files don't have :Y: in the ID, so nothing should be filtered.
617        let config = FastQCConfig {
618            casava: true,
619            nofilter: false,
620            ..FastQCConfig::default()
621        };
622        let path = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/minimal.fastq");
623        let mut reader = FastQFile::open(&config, path).unwrap();
624        let seq = reader.next().unwrap().unwrap();
625        // "@READ0001" has no ":Y:", so not filtered
626        assert!(!seq.is_filtered);
627    }
628
629    #[test]
630    fn test_sequence_uppercase() {
631        // Java uppercases the sequence. Our Sequence::new does the same.
632        let seq = Sequence::new(
633            "@test".to_string(),
634            b"acgtACGT".to_vec(),
635            b"IIIIIIII".to_vec(),
636        );
637        assert_eq!(seq.sequence, b"ACGTACGT");
638    }
639}