use std::io::BufRead;
use crate::core::CoreError;
/// One parsed FASTA entry: a record name plus its sequence bytes.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct FastaRecord {
    /// Record identifier.
    pub name: String,
    /// Sequence data for the record, stored as raw bytes.
    pub sequence: Vec<u8>,
}
/// Streaming FASTA parser state wrapped around an underlying reader.
///
/// The type itself places no trait bound on `R`; the `BufRead` requirement
/// lives on the `impl` blocks that actually read from it, so code that merely
/// names or stores a `FastaReader<R>` does not have to repeat the bound.
#[derive(Debug)]
pub struct FastaReader<R> {
    /// Underlying source of FASTA text.
    reader: R,
    /// Header text (without the leading `>`) that was encountered while
    /// reading the previous record's sequence lines and is held back for the
    /// next record.
    pending_header: Option<String>,
    /// Scratch buffer reused for every line read, to avoid a fresh allocation
    /// per line.
    line: String,
}
impl<R: BufRead> FastaReader<R> {
pub fn new(reader: R) -> Self {
Self {
reader,
pending_header: None,
line: String::new(),
}
}
}
/// Private parsing helpers backing the `Iterator` implementation.
impl<R: BufRead> FastaReader<R> {
    /// Reads forward to the next non-blank line, leaving it in `self.line`.
    ///
    /// Returns `Ok(true)` when a non-blank line is available and `Ok(false)`
    /// at end of input. Read failures are reported as `CoreError::Io`.
    fn read_content_line(&mut self) -> Result<bool, CoreError> {
        loop {
            self.line.clear();
            if self.reader.read_line(&mut self.line).map_err(CoreError::Io)? == 0 {
                return Ok(false);
            }
            if !self.line.trim().is_empty() {
                return Ok(true);
            }
        }
    }

    /// Produces the next header's text (without the leading `>`).
    ///
    /// A header held back from the previous record is used first; otherwise
    /// the next non-blank line must start with `>`. Returns `Ok(None)` at end
    /// of input.
    fn next_header(&mut self) -> Result<Option<String>, CoreError> {
        if let Some(held) = self.pending_header.take() {
            return Ok(Some(held));
        }
        if !self.read_content_line()? {
            return Ok(None);
        }
        let trimmed = self.line.trim();
        match trimmed.strip_prefix('>') {
            Some(rest) => Ok(Some(rest.to_string())),
            None => Err(CoreError::MalformedRecord(format!(
                "expected FASTA header starting with '>', found '{trimmed}'"
            ))),
        }
    }

    /// Collects sequence lines up to the next header or end of input.
    ///
    /// Each non-blank line is trimmed and ASCII-uppercased before being
    /// appended. When a header line is reached, its text is stored in
    /// `pending_header` so the following record can start from it.
    fn read_sequence(&mut self) -> Result<Vec<u8>, CoreError> {
        let mut sequence = Vec::new();
        while self.read_content_line()? {
            let trimmed = self.line.trim();
            if let Some(rest) = trimmed.strip_prefix('>') {
                self.pending_header = Some(rest.to_string());
                break;
            }
            sequence.extend(trimmed.bytes().map(|b| b.to_ascii_uppercase()));
        }
        Ok(sequence)
    }

    /// Parses one complete record, or returns `Ok(None)` at end of input.
    fn read_record(&mut self) -> Result<Option<FastaRecord>, CoreError> {
        let header = match self.next_header()? {
            Some(header) => header,
            None => return Ok(None),
        };
        // The record name is the first whitespace-delimited token of the header.
        let name = header.split_whitespace().next().map(str::to_owned);

        // Consume the record body *before* reporting a nameless header. If we
        // returned early, the record's sequence lines would still be unread
        // and each of them would surface as a spurious "expected FASTA
        // header" error on later calls.
        let sequence = self.read_sequence()?;

        let name = name.ok_or_else(|| {
            CoreError::MalformedRecord("FASTA header has no name".to_string())
        })?;
        if sequence.is_empty() {
            return Err(CoreError::MalformedRecord(format!(
                "FASTA record '{name}' has no sequence data"
            )));
        }
        Ok(Some(FastaRecord { name, sequence }))
    }
}

/// Iterates over the FASTA records of the underlying reader.
///
/// Each item is one record, or an error:
///
/// * `CoreError::Io` when reading from the underlying reader fails;
/// * `CoreError::MalformedRecord` when a non-blank line found where a header
///   is expected does not start with `>`, when a header contains no name, or
///   when a record has no sequence data.
///
/// Blank lines are ignored everywhere. After a nameless-header or
/// empty-sequence error the reader has already consumed that record's body,
/// so the next call resumes at the following header (or end of input).
impl<R: BufRead> Iterator for FastaReader<R> {
    type Item = Result<FastaRecord, CoreError>;

    fn next(&mut self) -> Option<Self::Item> {
        // `Result<Option<_>, _>` -> `Option<Result<_, _>>`: end of input
        // becomes `None`, everything else is wrapped in `Some`.
        self.read_record().transpose()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::ContigSet;
    use std::io::Cursor;

    /// Parses `input` to completion, panicking if any record fails to parse.
    fn read_all(input: &str) -> Vec<FastaRecord> {
        let reader = FastaReader::new(Cursor::new(input.as_bytes().to_vec()));
        let parsed: Result<Vec<FastaRecord>, CoreError> = reader.collect();
        parsed.expect("FASTA should parse")
    }

    /// Returns the error produced by the first item read from `input`,
    /// panicking if there is no first item or if it parsed successfully.
    fn first_error(input: &[u8]) -> CoreError {
        let mut reader = FastaReader::new(Cursor::new(input.to_vec()));
        let first = reader.next().unwrap();
        first.unwrap_err()
    }

    #[test]
    fn reads_single_record_uppercased() {
        let records = read_all(">chr1 some description\nacgt\nACGT\n");
        assert_eq!(records.len(), 1);
        let only = &records[0];
        assert_eq!(only.name, "chr1");
        assert_eq!(only.sequence, b"ACGTACGT");
    }

    #[test]
    fn reads_three_records_and_builds_a_three_contig_set() {
        let records = read_all(">chr1\nACGT\n>chr2\nAACCGGTT\n>chr3\nGG\n");
        assert_eq!(records.len(), 3);

        let mut contigs = ContigSet::new();
        for record in records.iter() {
            let length = record.sequence.len() as u32;
            contigs.push(record.name.clone(), length);
        }

        assert_eq!(contigs.len(), 3);
        let second = contigs.by_name("chr2").unwrap();
        assert_eq!(second.id, 1);
        let third = contigs.by_name("chr3").unwrap();
        assert_eq!(third.global_offset, 12);
    }

    #[test]
    fn skips_blank_lines_between_records() {
        let records = read_all("\n>chr1\nAC\n\nGT\n\n>chr2\nTT\n");
        assert_eq!(records.len(), 2);
        let (first, second) = (&records[0], &records[1]);
        assert_eq!(first.sequence, b"ACGT");
        assert_eq!(second.sequence, b"TT");
    }

    #[test]
    fn missing_header_is_a_malformed_record_error() {
        let err = first_error(b"ACGT\n");
        assert!(matches!(err, CoreError::MalformedRecord(_)));
    }

    #[test]
    fn header_without_sequence_is_a_malformed_record_error() {
        let err = first_error(b">chr1\n");
        assert!(matches!(err, CoreError::MalformedRecord(_)));
    }

    #[test]
    fn empty_input_yields_no_records() {
        let records = read_all("");
        assert_eq!(records.len(), 0);
    }
}