genomicframe_core/formats/fastq/
reader.rs1use crate::core::{GenomicReader, GenomicRecordIterator};
37use crate::error::{Error, Result};
38use crate::io::Compression;
39use flate2::read::MultiGzDecoder;
40use std::fs::File;
41use std::io::{BufRead, BufReader};
42use std::path::Path;
43
44#[derive(Debug, Clone, Copy, PartialEq, Eq)]
46pub enum QualityEncoding {
47 Phred33,
49 Phred64,
51}
52
53impl QualityEncoding {
54 pub fn offset(&self) -> u8 {
56 match self {
57 QualityEncoding::Phred33 => 33,
58 QualityEncoding::Phred64 => 64,
59 }
60 }
61
62 pub fn detect(quality: &str) -> Self {
64 for byte in quality.bytes() {
69 if byte < 64 {
70 return QualityEncoding::Phred33;
71 }
72 }
73 QualityEncoding::Phred33
75 }
76}
77
78#[derive(Debug, Clone)]
80pub struct FastqRecord {
81 pub id: String,
83 pub description: Option<String>,
85 pub sequence: String,
87 pub quality: String,
89}
90
91impl FastqRecord {
92 pub fn len(&self) -> usize {
94 self.sequence.len()
95 }
96
97 pub fn is_empty(&self) -> bool {
99 self.sequence.is_empty()
100 }
101
102 pub fn phred_scores(&self, encoding: QualityEncoding) -> Vec<u8> {
104 let offset = encoding.offset();
105 self.quality
106 .bytes()
107 .map(|b| b.saturating_sub(offset))
108 .collect()
109 }
110
111 pub fn mean_quality(&self) -> f64 {
113 self.mean_quality_with_encoding(QualityEncoding::Phred33)
114 }
115
116 pub fn mean_quality_with_encoding(&self, encoding: QualityEncoding) -> f64 {
118 let scores = self.phred_scores(encoding);
119 if scores.is_empty() {
120 return 0.0;
121 }
122 let sum: u32 = scores.iter().map(|&s| s as u32).sum();
123 sum as f64 / scores.len() as f64
124 }
125
126 pub fn is_valid(&self) -> bool {
128 self.sequence.len() == self.quality.len()
129 }
130
131 pub fn gc_content(&self) -> f64 {
133 let gc_count = self
134 .sequence
135 .bytes()
136 .filter(|&b| b == b'G' || b == b'C' || b == b'g' || b == b'c')
137 .count();
138 if self.sequence.is_empty() {
139 0.0
140 } else {
141 gc_count as f64 / self.sequence.len() as f64
142 }
143 }
144
145 pub fn n_content(&self) -> f64 {
147 let n_count = self
148 .sequence
149 .bytes()
150 .filter(|&b| b == b'N' || b == b'n')
151 .count();
152 if self.sequence.is_empty() {
153 0.0
154 } else {
155 n_count as f64 / self.sequence.len() as f64
156 }
157 }
158}
159
160#[derive(Debug, Clone, Default)]
162pub struct FastqHeader {
163 pub encoding: Option<QualityEncoding>,
165}
166
167pub struct FastqReader {
177 reader: Box<dyn BufRead>,
178 header: FastqHeader,
179 line_number: usize,
180 id_buffer: String,
182 seq_buffer: String,
183 sep_buffer: String,
184 qual_buffer: String,
185}
186
187impl FastqReader {
188 pub fn from_buffer<R: BufRead + 'static>(reader: R) -> Self {
207 Self {
208 reader: Box::new(reader),
209 header: FastqHeader::default(),
210 line_number: 0,
211 id_buffer: String::with_capacity(256),
213 seq_buffer: String::with_capacity(256),
214 sep_buffer: String::with_capacity(4),
215 qual_buffer: String::with_capacity(256),
216 }
217 }
218
219 pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
244 let path = path.as_ref();
245 let file = File::open(path)?;
246 let compression = Compression::from_path(path);
247
248 let reader: Box<dyn BufRead> = match compression {
249 Compression::Gzip | Compression::Bgzip => {
250 Box::new(BufReader::new(MultiGzDecoder::new(file)))
251 }
252 _ => Box::new(BufReader::new(file)),
253 };
254
255 Ok(Self {
256 reader,
257 header: FastqHeader::default(),
258 line_number: 0,
259 id_buffer: String::with_capacity(256),
261 seq_buffer: String::with_capacity(256),
262 sep_buffer: String::with_capacity(4),
263 qual_buffer: String::with_capacity(256),
264 })
265 }
266
267 pub fn header(&self) -> &FastqHeader {
269 &self.header
270 }
271
272
273 fn parse_record(&mut self) -> Result<Option<FastqRecord>> {
275 self.id_buffer.clear();
277 let bytes = self.reader.read_line(&mut self.id_buffer)?;
278 if bytes > 0 {
279 self.line_number += 1;
280 } else {
281 return Ok(None); }
283
284 let id_line = self.id_buffer.trim();
286 if !id_line.starts_with('@') {
287 return Err(Error::Parse(format!(
288 "Line {}: Expected '@' at start of FASTQ record, got: {}",
289 self.line_number,
290 id_line.chars().take(50).collect::<String>()
291 )));
292 }
293
294 let header_content = &id_line[1..]; let (id, description) = if let Some((id, desc)) = header_content.split_once(char::is_whitespace) {
297 (id.to_string(), Some(desc.trim().to_string()))
298 } else {
299 (header_content.to_string(), None)
300 };
301
302 self.seq_buffer.clear();
304 let bytes = self.reader.read_line(&mut self.seq_buffer)?;
305 if bytes > 0 {
306 self.line_number += 1;
307 } else {
308 return Err(Error::Parse(format!(
309 "Line {}: Unexpected end of file while reading sequence",
310 self.line_number
311 )));
312 }
313 let sequence = self.seq_buffer.trim().to_string();
314
315 self.sep_buffer.clear();
317 let bytes = self.reader.read_line(&mut self.sep_buffer)?;
318 if bytes > 0 {
319 self.line_number += 1;
320 } else {
321 return Err(Error::Parse(format!(
322 "Line {}: Unexpected end of file while reading separator",
323 self.line_number
324 )));
325 }
326 let separator = self.sep_buffer.trim();
327 if !separator.starts_with('+') {
328 return Err(Error::Parse(format!(
329 "Line {}: Expected '+' separator, got: {}",
330 self.line_number, separator
331 )));
332 }
333
334 self.qual_buffer.clear();
336 let bytes = self.reader.read_line(&mut self.qual_buffer)?;
337 if bytes > 0 {
338 self.line_number += 1;
339 } else {
340 return Err(Error::Parse(format!(
341 "Line {}: Unexpected end of file while reading quality",
342 self.line_number
343 )));
344 }
345 let quality = self.qual_buffer.trim().to_string();
346
347 if sequence.len() != quality.len() {
349 return Err(Error::Parse(format!(
350 "Line {}: Sequence length ({}) does not match quality length ({})",
351 self.line_number,
352 sequence.len(),
353 quality.len()
354 )));
355 }
356
357 if self.header.encoding.is_none() && !quality.is_empty() {
359 self.header.encoding = Some(QualityEncoding::detect(&quality));
360 }
361
362 Ok(Some(FastqRecord {
363 id,
364 description,
365 sequence,
366 quality,
367 }))
368 }
369}
370
371impl GenomicRecordIterator for FastqReader {
372 type Record = FastqRecord;
373
374 fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
375 Ok(None)
377 }
378
379 fn next_record(&mut self) -> Result<Option<Self::Record>> {
380 self.parse_record()
381 }
382}
383
384impl GenomicReader for FastqReader {
385 type Metadata = FastqHeader;
386
387 fn metadata(&self) -> &Self::Metadata {
388 &self.header
389 }
390}
391
392#[cfg(test)]
393mod tests {
394 use super::*;
395 use std::io::Write;
396 use tempfile::NamedTempFile;
397
398 #[test]
399 fn test_fastq_basic_read() -> Result<()> {
400 let fastq_data = "@READ1 description here\n\
401 ACGTACGTACGT\n\
402 +\n\
403 IIIIIIIIIIII\n\
404 @READ2\n\
405 GGGGCCCCAAAA\n\
406 +READ2\n\
407 HHHHHHHHHHHH\n";
408
409 let mut temp_file = NamedTempFile::new()?;
410 temp_file.write_all(fastq_data.as_bytes())?;
411 temp_file.flush()?;
412
413 let mut reader = FastqReader::from_path(temp_file.path())?;
414
415 let record1 = reader.next_record()?.expect("Should have first record");
417 assert_eq!(record1.id, "READ1");
418 assert_eq!(record1.description, Some("description here".to_string()));
419 assert_eq!(record1.sequence, "ACGTACGTACGT");
420 assert_eq!(record1.quality, "IIIIIIIIIIII");
421 assert_eq!(record1.len(), 12);
422 assert!(record1.is_valid());
423
424 let record2 = reader.next_record()?.expect("Should have second record");
426 assert_eq!(record2.id, "READ2");
427 assert_eq!(record2.description, None);
428 assert_eq!(record2.sequence, "GGGGCCCCAAAA");
429 assert_eq!(record2.quality, "HHHHHHHHHHHH");
430
431 assert!(reader.next_record()?.is_none());
433
434 Ok(())
435 }
436
437 #[test]
438 fn test_phred_scores() {
439 let record = FastqRecord {
440 id: "test".to_string(),
441 description: None,
442 sequence: "ACGT".to_string(),
443 quality: "!5IA".to_string(), };
445
446 let scores = record.phred_scores(QualityEncoding::Phred33);
447 assert_eq!(scores, vec![0, 20, 40, 32]);
448
449 let mean = record.mean_quality();
450 assert!((mean - 23.0).abs() < 0.1);
451 }
452
453 #[test]
454 fn test_gc_content() {
455 let record = FastqRecord {
456 id: "test".to_string(),
457 description: None,
458 sequence: "ACGTACGT".to_string(), quality: "IIIIIIII".to_string(),
460 };
461
462 assert_eq!(record.gc_content(), 0.5);
463 }
464
465 #[test]
466 fn test_invalid_record() {
467 let fastq_data = "@READ1\n\
468 ACGT\n\
469 +\n\
470 III\n"; let mut temp_file = NamedTempFile::new().unwrap();
473 temp_file.write_all(fastq_data.as_bytes()).unwrap();
474 temp_file.flush().unwrap();
475
476 let mut reader = FastqReader::from_path(temp_file.path()).unwrap();
477 let result = reader.next_record();
478 assert!(result.is_err());
479 }
480
481 #[test]
482 fn test_quality_encoding_detection() {
483 assert_eq!(
485 QualityEncoding::detect("!5IA"),
486 QualityEncoding::Phred33
487 );
488
489 assert_eq!(
491 QualityEncoding::detect("@@@@"),
492 QualityEncoding::Phred33 );
494 }
495
496 #[test]
497 fn test_missing_separator() {
498 let fastq_data = "@READ1\n\
499 ACGT\n\
500 -\n\
501 IIII\n"; let mut temp_file = NamedTempFile::new().unwrap();
504 temp_file.write_all(fastq_data.as_bytes()).unwrap();
505 temp_file.flush().unwrap();
506
507 let mut reader = FastqReader::from_path(temp_file.path()).unwrap();
508 let result = reader.next_record();
509 assert!(result.is_err());
510 }
511}
512