use std::io::BufRead;
use crate::core::CoreError;
/// One parsed FASTQ record.
///
/// When produced by `FastqReader`, `sequence` and `qualities` always have the
/// same length in bytes (records failing that check are reported as errors).
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct FastqRecord {
    /// Read name: the first whitespace-delimited token following the leading
    /// `@` of the header line. Anything after it on the header is discarded.
    pub name: String,
    /// Sequence line as bytes; the reader upper-cases ASCII letters.
    pub sequence: Vec<u8>,
    /// Quality line as raw bytes, stored exactly as read (after trimming
    /// surrounding whitespace); no score decoding is applied.
    pub qualities: Vec<u8>,
}
/// Streaming FASTQ parser over any buffered reader.
///
/// Iterating the reader yields one `Result<FastqRecord, CoreError>` per
/// record, where a record is a header line, a sequence line, a `+` separator
/// line and a quality line.
#[derive(Debug)]
pub struct FastqReader<R>
where
    R: BufRead,
{
    /// Underlying buffered input.
    reader: R,
    /// Scratch buffer that is cleared and refilled for every line read, so a
    /// new `String` is not allocated for each `read_line` call.
    line: String,
}
impl<R> FastqReader<R>
where
    R: BufRead,
{
    /// Wraps `reader` in a FASTQ parser with an empty line buffer.
    ///
    /// Nothing is read until the iterator is advanced.
    pub fn new(reader: R) -> Self {
        Self {
            reader,
            line: String::new(),
        }
    }

    /// Returns the next line that is non-empty after trimming whitespace.
    ///
    /// Lines that are empty or whitespace-only are skipped. The returned
    /// string is trimmed on both ends. `Ok(None)` signals end of input.
    ///
    /// # Errors
    ///
    /// Any error from `read_line` is propagated, converted into `CoreError`
    /// by `?`.
    fn next_nonblank(&mut self) -> Result<Option<String>, CoreError> {
        loop {
            self.line.clear();
            let bytes_read = self.reader.read_line(&mut self.line)?;
            if bytes_read == 0 {
                // `read_line` reports end of input as zero bytes read.
                return Ok(None);
            }
            match self.line.trim() {
                "" => continue,
                content => return Ok(Some(content.to_owned())),
            }
        }
    }

    /// Reads exactly one more line and returns it trimmed on both ends.
    ///
    /// Unlike `next_nonblank`, a blank line is NOT skipped: it is returned as
    /// an empty string. `ctx` names the part of the record being read and is
    /// only used in the end-of-input error message.
    ///
    /// # Errors
    ///
    /// Returns `CoreError::MalformedRecord` when the input ends before the
    /// line could be read, and propagates any error from `read_line`.
    fn next_required(&mut self, ctx: &str) -> Result<String, CoreError> {
        self.line.clear();
        match self.reader.read_line(&mut self.line)? {
            0 => Err(CoreError::MalformedRecord(format!(
                "unexpected end of FASTQ while reading {ctx}"
            ))),
            _ => Ok(self.line.trim().to_owned()),
        }
    }
}
impl<R: BufRead> Iterator for FastqReader<R> {
    type Item = Result<FastqRecord, CoreError>;

    /// Yields the next record, an error describing why it could not be
    /// parsed, or `None` once the input is exhausted.
    ///
    /// The reader keeps no error state: after an `Err` the next call simply
    /// continues from wherever the underlying reader currently is.
    fn next(&mut self) -> Option<Self::Item> {
        // `Result<Option<T>, E>` -> `Option<Result<T, E>>`.
        self.read_record().transpose()
    }
}

impl<R: BufRead> FastqReader<R> {
    /// Parses one four-line FASTQ record (header, sequence, `+`, qualities).
    ///
    /// Blank lines before the header are skipped; the three lines after the
    /// header are taken positionally, so a quality line that starts with `@`
    /// is never mistaken for a header.
    ///
    /// Returns `Ok(None)` when the input ends before a header is found.
    ///
    /// # Errors
    ///
    /// Returns `CoreError::MalformedRecord` when
    /// * the header does not start with `@` or carries no read name,
    /// * the input ends in the middle of a record,
    /// * the third line does not start with `+`, or
    /// * sequence and quality lines differ in byte length.
    ///
    /// Errors from the underlying reader are propagated as well.
    fn read_record(&mut self) -> Result<Option<FastqRecord>, CoreError> {
        let header = match self.next_nonblank()? {
            Some(line) => line,
            None => return Ok(None),
        };

        let rest = header.strip_prefix('@').ok_or_else(|| {
            CoreError::MalformedRecord(format!(
                "expected FASTQ header starting with '@', found '{header}'"
            ))
        })?;
        // Only the first whitespace-delimited token is the read name; the
        // remainder of the header (comment / description) is dropped.
        let name = rest
            .split_whitespace()
            .next()
            .ok_or_else(|| CoreError::MalformedRecord("FASTQ header has no read name".to_string()))?
            .to_string();

        // Upper-case in place instead of allocating a second buffer.
        let mut sequence = self.next_required("sequence")?.into_bytes();
        sequence.make_ascii_uppercase();

        let plus = self.next_required("'+' separator")?;
        if !plus.starts_with('+') {
            return Err(CoreError::MalformedRecord(format!(
                "expected '+' separator for read '{name}', found '{plus}'"
            )));
        }

        let qualities = self.next_required("qualities")?.into_bytes();
        if sequence.len() != qualities.len() {
            return Err(CoreError::MalformedRecord(format!(
                "sequence/quality length mismatch for read '{name}' ({} vs {})",
                sequence.len(),
                qualities.len()
            )));
        }

        Ok(Some(FastqRecord {
            name,
            sequence,
            qualities,
        }))
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Parses `input` as FASTQ and collects every record, stopping at the
    /// first error.
    fn read_all(input: &str) -> Result<Vec<FastqRecord>, CoreError> {
        let source = Cursor::new(input.as_bytes().to_vec());
        FastqReader::new(source).collect()
    }

    // The header comment after the first space must not end up in the name,
    // and lower-case bases are upper-cased.
    #[test]
    fn reads_two_records_and_trims_name_after_space() {
        let input = "@read1 1:N:0:CG\nacgt\n+\nIIII\n@read2\nTT\n+\n##\n";
        let records = read_all(input).unwrap();
        assert_eq!(records.len(), 2);

        let first = &records[0];
        assert_eq!(first.name, "read1");
        assert_eq!(first.sequence, b"ACGT");
        assert_eq!(first.qualities, b"IIII");

        let second = &records[1];
        assert_eq!(second.name, "read2");
    }

    #[test]
    fn missing_at_prefix_is_malformed() {
        let outcome = read_all("read1\nACGT\n+\nIIII\n");
        assert!(matches!(outcome, Err(CoreError::MalformedRecord(_))));
    }

    #[test]
    fn missing_plus_separator_is_malformed() {
        let outcome = read_all("@read1\nACGT\n-\nIIII\n");
        assert!(matches!(outcome, Err(CoreError::MalformedRecord(_))));
    }

    #[test]
    fn seq_qual_length_mismatch_is_malformed() {
        let outcome = read_all("@read1\nACGT\n+\nII\n");
        assert!(matches!(outcome, Err(CoreError::MalformedRecord(_))));
    }

    // Input that stops after the sequence line is an error, not a silent EOF.
    #[test]
    fn truncated_record_at_eof_is_malformed() {
        let outcome = read_all("@read1\nACGT\n");
        assert!(matches!(outcome, Err(CoreError::MalformedRecord(_))));
    }

    #[test]
    fn empty_input_yields_no_records() {
        let records = read_all("").unwrap();
        assert!(records.is_empty());
    }

    // '@' is a legal quality character; lines are interpreted by position
    // within the record, not by their first byte.
    #[test]
    fn quality_line_starting_with_at_is_not_a_header() {
        let records = read_all("@r1\nACGT\n+\n@III\n").unwrap();
        assert_eq!(records.len(), 1);
        assert_eq!(records[0].qualities, b"@III");
    }
}