Skip to main content

genomicframe_core/formats/fastq/
reader.rs

1//! FASTQ format support (sequence data with quality scores)
2//!
3//! FASTQ is a text-based format for representing nucleotide sequences
4//! with per-base quality scores. Each record consists of 4 lines:
5//! 1. Header: @ID description
6//! 2. Sequence: DNA/RNA bases
7//! 3. Separator: + (optionally followed by ID)
8//! 4. Quality: ASCII-encoded Phred scores
9//!
10//! # Design Philosophy
11//!
12//! - **Streaming by default**: Records are never buffered unless explicitly requested
13//! - **Minimal allocations**: Reuses buffers where possible
14//! - **Automatic compression detection**: Handles `.fastq`, `.fq`, `.fastq.gz`, `.fq.gz`
15//! - **Lazy evaluation**: Process millions of reads with O(1) memory
16//! - **Quality score parsing**: Phred+33 (Sanger) and Phred+64 (Illumina 1.3+) support
17//!
18//! # Examples
19//!
20//! ```no_run
21//! use genomicframe_core::formats::fastq::FastqReader;
22//! use genomicframe_core::core::GenomicRecordIterator;
23//!
24//! // Streaming: O(1) memory, processes one read at a time
25//! let mut reader = FastqReader::from_path("reads.fastq.gz")?;
26//!
27//! while let Some(record) = reader.next_record()? {
28//!     let mean_quality = record.mean_quality();
29//!     if mean_quality >= 30.0 {
30//!         println!("High quality read: {}", record.id);
31//!     }
32//! }
33//! # Ok::<(), genomicframe_core::error::Error>(())
34//! ```
35
36use crate::core::{GenomicReader, GenomicRecordIterator};
37use crate::error::{Error, Result};
38use crate::io::Compression;
39use flate2::read::MultiGzDecoder;
40use std::fs::File;
41use std::io::{BufRead, BufReader};
42use std::path::Path;
43
44/// Quality score encoding scheme
45#[derive(Debug, Clone, Copy, PartialEq, Eq)]
46pub enum QualityEncoding {
47    /// Phred+33 (Sanger, Illumina 1.8+) - standard modern format
48    Phred33,
49    /// Phred+64 (Illumina 1.3-1.7) - legacy format
50    Phred64,
51}
52
53impl QualityEncoding {
54    /// Get the ASCII offset for this encoding
55    pub fn offset(&self) -> u8 {
56        match self {
57            QualityEncoding::Phred33 => 33,
58            QualityEncoding::Phred64 => 64,
59        }
60    }
61
62    /// Detect encoding from quality string (heuristic)
63    pub fn detect(quality: &str) -> Self {
64        // Check for characters that only appear in Phred+64
65        // Phred+33: ! to ~  (33-126)
66        // Phred+64: @ to ~  (64-126)
67        // If we see characters < 64, it must be Phred+33
68        for byte in quality.bytes() {
69            if byte < 64 {
70                return QualityEncoding::Phred33;
71            }
72        }
73        // Default to Phred+33 (modern standard)
74        QualityEncoding::Phred33
75    }
76}
77
78/// A single FASTQ record (DNA/RNA sequence with quality scores)
79#[derive(Debug, Clone)]
80pub struct FastqRecord {
81    /// Sequence identifier (header line without '@')
82    pub id: String,
83    /// Optional description (text after first whitespace in header)
84    pub description: Option<String>,
85    /// Sequence data (DNA/RNA bases)
86    pub sequence: String,
87    /// Quality scores (ASCII-encoded Phred scores)
88    pub quality: String,
89}
90
91impl FastqRecord {
92    /// Get sequence length (number of bases)
93    pub fn len(&self) -> usize {
94        self.sequence.len()
95    }
96
97    /// Check if sequence is empty
98    pub fn is_empty(&self) -> bool {
99        self.sequence.is_empty()
100    }
101
102    /// Convert quality string to Phred scores
103    pub fn phred_scores(&self, encoding: QualityEncoding) -> Vec<u8> {
104        let offset = encoding.offset();
105        self.quality
106            .bytes()
107            .map(|b| b.saturating_sub(offset))
108            .collect()
109    }
110
111    /// Get mean quality score
112    pub fn mean_quality(&self) -> f64 {
113        self.mean_quality_with_encoding(QualityEncoding::Phred33)
114    }
115
116    /// Get mean quality score with specific encoding
117    pub fn mean_quality_with_encoding(&self, encoding: QualityEncoding) -> f64 {
118        let scores = self.phred_scores(encoding);
119        if scores.is_empty() {
120            return 0.0;
121        }
122        let sum: u32 = scores.iter().map(|&s| s as u32).sum();
123        sum as f64 / scores.len() as f64
124    }
125
126    /// Check if sequence and quality have matching lengths
127    pub fn is_valid(&self) -> bool {
128        self.sequence.len() == self.quality.len()
129    }
130
131    /// Get GC content as a fraction (0.0 to 1.0)
132    pub fn gc_content(&self) -> f64 {
133        let gc_count = self
134            .sequence
135            .bytes()
136            .filter(|&b| b == b'G' || b == b'C' || b == b'g' || b == b'c')
137            .count();
138        if self.sequence.is_empty() {
139            0.0
140        } else {
141            gc_count as f64 / self.sequence.len() as f64
142        }
143    }
144
145    /// Get N content (ambiguous bases) as a fraction (0.0 to 1.0)
146    pub fn n_content(&self) -> f64 {
147        let n_count = self
148            .sequence
149            .bytes()
150            .filter(|&b| b == b'N' || b == b'n')
151            .count();
152        if self.sequence.is_empty() {
153            0.0
154        } else {
155            n_count as f64 / self.sequence.len() as f64
156        }
157    }
158}
159
160/// FASTQ header metadata (minimal - FASTQ has no standard header)
161#[derive(Debug, Clone, Default)]
162pub struct FastqHeader {
163    /// Detected quality encoding (set after reading first record)
164    pub encoding: Option<QualityEncoding>,
165}
166
167/// FASTQ file reader (streaming, memory-efficient)
168///
169/// Create using `FastqReader::from_path()` or `FastqReader::from_buffer()`.
170///
171/// FASTQ format:
172/// - Line 1: @ID description
173/// - Line 2: Sequence
174/// - Line 3: + (optional ID)
175/// - Line 4: Quality
176pub struct FastqReader {
177    reader: Box<dyn BufRead>,
178    header: FastqHeader,
179    line_number: usize,
180    // Reusable buffers to avoid allocations
181    id_buffer: String,
182    seq_buffer: String,
183    sep_buffer: String,
184    qual_buffer: String,
185}
186
187impl FastqReader {
188    /// Create a FASTQ reader from a buffered reader
189    ///
190    /// This is the low-level constructor that accepts any buffered reader.
191    /// Use this when you already have a BufRead source (e.g., from seeking
192    /// to a specific position in a file for parallel processing).
193    ///
194    /// # Example
195    ///
196    /// ```no_run
197    /// use genomicframe_core::formats::fastq::FastqReader;
198    /// use std::io::BufReader;
199    /// use std::fs::File;
200    ///
201    /// let file = File::open("reads.fastq")?;
202    /// let buf_reader = BufReader::new(file);
203    /// let mut reader = FastqReader::from_buffer(buf_reader);
204    /// # Ok::<(), genomicframe_core::error::Error>(())
205    /// ```
206    pub fn from_buffer<R: BufRead + 'static>(reader: R) -> Self {
207        Self {
208            reader: Box::new(reader),
209            header: FastqHeader::default(),
210            line_number: 0,
211            // Pre-allocate buffers with reasonable capacity (typical read ~150bp)
212            id_buffer: String::with_capacity(256),
213            seq_buffer: String::with_capacity(256),
214            sep_buffer: String::with_capacity(4),
215            qual_buffer: String::with_capacity(256),
216        }
217    }
218
219    /// Open a FASTQ file - handles .fastq, .fq, .fastq.gz, .fq.gz automatically
220    ///
221    /// # Memory Behavior
222    ///
223    /// - Opens file handle (~8KB buffer)
224    /// - **Does NOT load reads into memory**
225    /// - Records are parsed on-demand as you iterate
226    ///
227    /// # Example
228    ///
229    /// ```no_run
230    /// use genomicframe_core::formats::fastq::FastqReader;
231    /// use genomicframe_core::core::GenomicRecordIterator;
232    ///
233    /// // Works with multi-GB files using only ~10KB RAM
234    /// let mut reader = FastqReader::from_path("reads.fastq.gz")?;
235    ///
236    /// while let Some(record) = reader.next_record()? {
237    ///     if record.mean_quality() >= 30.0 {
238    ///         println!("High quality: {}", record.id);
239    ///     }
240    /// }
241    /// # Ok::<(), genomicframe_core::error::Error>(())
242    /// ```
243    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
244        let path = path.as_ref();
245        let file = File::open(path)?;
246        let compression = Compression::from_path(path);
247
248        let reader: Box<dyn BufRead> = match compression {
249            Compression::Gzip | Compression::Bgzip => {
250                Box::new(BufReader::new(MultiGzDecoder::new(file)))
251            }
252            _ => Box::new(BufReader::new(file)),
253        };
254
255        Ok(Self {
256            reader,
257            header: FastqHeader::default(),
258            line_number: 0,
259            // Pre-allocate buffers with reasonable capacity (typical read ~150bp)
260            id_buffer: String::with_capacity(256),
261            seq_buffer: String::with_capacity(256),
262            sep_buffer: String::with_capacity(4),
263            qual_buffer: String::with_capacity(256),
264        })
265    }
266
267    /// Get the FASTQ header metadata
268    pub fn header(&self) -> &FastqHeader {
269        &self.header
270    }
271
272
273    /// Parse a FASTQ record (4 lines)
274    fn parse_record(&mut self) -> Result<Option<FastqRecord>> {
275        // Line 1: Header (@ID description)
276        self.id_buffer.clear();
277        let bytes = self.reader.read_line(&mut self.id_buffer)?;
278        if bytes > 0 {
279            self.line_number += 1;
280        } else {
281            return Ok(None); // End of file
282        }
283
284        // Trim in-place by finding actual content length (strip trailing whitespace)
285        let id_line = self.id_buffer.trim();
286        if !id_line.starts_with('@') {
287            return Err(Error::Parse(format!(
288                "Line {}: Expected '@' at start of FASTQ record, got: {}",
289                self.line_number,
290                id_line.chars().take(50).collect::<String>()
291            )));
292        }
293
294        // Parse ID and description
295        let header_content = &id_line[1..]; // Skip '@'
296        let (id, description) = if let Some((id, desc)) = header_content.split_once(char::is_whitespace) {
297            (id.to_string(), Some(desc.trim().to_string()))
298        } else {
299            (header_content.to_string(), None)
300        };
301
302        // Line 2: Sequence
303        self.seq_buffer.clear();
304        let bytes = self.reader.read_line(&mut self.seq_buffer)?;
305        if bytes > 0 {
306            self.line_number += 1;
307        } else {
308            return Err(Error::Parse(format!(
309                "Line {}: Unexpected end of file while reading sequence",
310                self.line_number
311            )));
312        }
313        let sequence = self.seq_buffer.trim().to_string();
314
315        // Line 3: Separator (+ optional ID)
316        self.sep_buffer.clear();
317        let bytes = self.reader.read_line(&mut self.sep_buffer)?;
318        if bytes > 0 {
319            self.line_number += 1;
320        } else {
321            return Err(Error::Parse(format!(
322                "Line {}: Unexpected end of file while reading separator",
323                self.line_number
324            )));
325        }
326        let separator = self.sep_buffer.trim();
327        if !separator.starts_with('+') {
328            return Err(Error::Parse(format!(
329                "Line {}: Expected '+' separator, got: {}",
330                self.line_number, separator
331            )));
332        }
333
334        // Line 4: Quality
335        self.qual_buffer.clear();
336        let bytes = self.reader.read_line(&mut self.qual_buffer)?;
337        if bytes > 0 {
338            self.line_number += 1;
339        } else {
340            return Err(Error::Parse(format!(
341                "Line {}: Unexpected end of file while reading quality",
342                self.line_number
343            )));
344        }
345        let quality = self.qual_buffer.trim().to_string();
346
347        // Validate sequence and quality lengths match
348        if sequence.len() != quality.len() {
349            return Err(Error::Parse(format!(
350                "Line {}: Sequence length ({}) does not match quality length ({})",
351                self.line_number,
352                sequence.len(),
353                quality.len()
354            )));
355        }
356
357        // Auto-detect quality encoding from first record
358        if self.header.encoding.is_none() && !quality.is_empty() {
359            self.header.encoding = Some(QualityEncoding::detect(&quality));
360        }
361
362        Ok(Some(FastqRecord {
363            id,
364            description,
365            sequence,
366            quality,
367        }))
368    }
369}
370
371impl GenomicRecordIterator for FastqReader {
372    type Record = FastqRecord;
373
374    fn next_raw(&mut self) -> Result<Option<Vec<u8>>> { 
375        // TODO: Implement
376        Ok(None)
377    }
378
379    fn next_record(&mut self) -> Result<Option<Self::Record>> {
380        self.parse_record()
381    }
382}
383
384impl GenomicReader for FastqReader {
385    type Metadata = FastqHeader;
386
387    fn metadata(&self) -> &Self::Metadata {
388        &self.header
389    }
390}
391
392#[cfg(test)]
393mod tests {
394    use super::*;
395    use std::io::Write;
396    use tempfile::NamedTempFile;
397
398    #[test]
399    fn test_fastq_basic_read() -> Result<()> {
400        let fastq_data = "@READ1 description here\n\
401                          ACGTACGTACGT\n\
402                          +\n\
403                          IIIIIIIIIIII\n\
404                          @READ2\n\
405                          GGGGCCCCAAAA\n\
406                          +READ2\n\
407                          HHHHHHHHHHHH\n";
408
409        let mut temp_file = NamedTempFile::new()?;
410        temp_file.write_all(fastq_data.as_bytes())?;
411        temp_file.flush()?;
412
413        let mut reader = FastqReader::from_path(temp_file.path())?;
414
415        // First record
416        let record1 = reader.next_record()?.expect("Should have first record");
417        assert_eq!(record1.id, "READ1");
418        assert_eq!(record1.description, Some("description here".to_string()));
419        assert_eq!(record1.sequence, "ACGTACGTACGT");
420        assert_eq!(record1.quality, "IIIIIIIIIIII");
421        assert_eq!(record1.len(), 12);
422        assert!(record1.is_valid());
423
424        // Second record
425        let record2 = reader.next_record()?.expect("Should have second record");
426        assert_eq!(record2.id, "READ2");
427        assert_eq!(record2.description, None);
428        assert_eq!(record2.sequence, "GGGGCCCCAAAA");
429        assert_eq!(record2.quality, "HHHHHHHHHHHH");
430
431        // End of file
432        assert!(reader.next_record()?.is_none());
433
434        Ok(())
435    }
436
437    #[test]
438    fn test_phred_scores() {
439        let record = FastqRecord {
440            id: "test".to_string(),
441            description: None,
442            sequence: "ACGT".to_string(),
443            quality: "!5IA".to_string(), // Phred+33: 0, 20, 40, 32
444        };
445
446        let scores = record.phred_scores(QualityEncoding::Phred33);
447        assert_eq!(scores, vec![0, 20, 40, 32]);
448
449        let mean = record.mean_quality();
450        assert!((mean - 23.0).abs() < 0.1);
451    }
452
453    #[test]
454    fn test_gc_content() {
455        let record = FastqRecord {
456            id: "test".to_string(),
457            description: None,
458            sequence: "ACGTACGT".to_string(), // 4 GC out of 8 = 50%
459            quality: "IIIIIIII".to_string(),
460        };
461
462        assert_eq!(record.gc_content(), 0.5);
463    }
464
465    #[test]
466    fn test_invalid_record() {
467        let fastq_data = "@READ1\n\
468                          ACGT\n\
469                          +\n\
470                          III\n"; // Quality too short!
471
472        let mut temp_file = NamedTempFile::new().unwrap();
473        temp_file.write_all(fastq_data.as_bytes()).unwrap();
474        temp_file.flush().unwrap();
475
476        let mut reader = FastqReader::from_path(temp_file.path()).unwrap();
477        let result = reader.next_record();
478        assert!(result.is_err());
479    }
480
481    #[test]
482    fn test_quality_encoding_detection() {
483        // Phred+33 has characters < 64
484        assert_eq!(
485            QualityEncoding::detect("!5IA"),
486            QualityEncoding::Phred33
487        );
488
489        // Phred+64 has no characters < 64
490        assert_eq!(
491            QualityEncoding::detect("@@@@"),
492            QualityEncoding::Phred33 // Defaults to Phred33
493        );
494    }
495
496    #[test]
497    fn test_missing_separator() {
498        let fastq_data = "@READ1\n\
499                          ACGT\n\
500                          -\n\
501                          IIII\n"; // Wrong separator!
502
503        let mut temp_file = NamedTempFile::new().unwrap();
504        temp_file.write_all(fastq_data.as_bytes()).unwrap();
505        temp_file.flush().unwrap();
506
507        let mut reader = FastqReader::from_path(temp_file.path()).unwrap();
508        let result = reader.next_record();
509        assert!(result.is_err());
510    }
511}
512