Skip to main content

rustalign_io/
fasta.rs

1//! FASTA file parsing
2
3use crate::parse_dna;
4use rustalign_common::{AlignError, AlignResult, Nuc};
5use std::fs::File;
6use std::io::{BufRead, BufReader};
7use std::path::Path;
8
9/// A FASTA record
10#[derive(Debug, Clone)]
11pub struct FastaRecord {
12    /// Sequence ID (header line without >)
13    pub id: String,
14
15    /// Description (after first space in header)
16    pub desc: Option<String>,
17
18    /// Sequence data
19    pub seq: Vec<Nuc>,
20}
21
22impl FastaRecord {
23    /// Create a new FASTA record
24    pub fn new(id: String, seq: Vec<Nuc>) -> Self {
25        Self {
26            id,
27            desc: None,
28            seq,
29        }
30    }
31
32    /// Get the sequence length
33    pub fn len(&self) -> usize {
34        self.seq.len()
35    }
36
37    /// Check if sequence is empty
38    pub fn is_empty(&self) -> bool {
39        self.seq.is_empty()
40    }
41}
42
/// FASTA file reader.
///
/// Holds the input source plus the header and raw sequence bytes of the
/// record that is currently being accumulated.
///
/// The `BufRead` requirement is stated on the `impl` blocks that need it
/// rather than on the type itself, so the struct definition places no
/// constraint on `R`.
pub struct FastaReader<R> {
    /// Underlying input source the lines are read from.
    reader: R,
    /// ID of the record being accumulated; empty while no record is in progress.
    current_id: String,
    /// Description of the record being accumulated, if its header had one.
    current_desc: Option<String>,
    /// Raw sequence bytes collected so far for the record being accumulated.
    current_seq: Vec<u8>,
}
50
51impl FastaReader<BufReader<File>> {
52    /// Open a FASTA file
53    pub fn from_path<P: AsRef<Path>>(path: P) -> AlignResult<Self> {
54        let file = File::open(path)?;
55        Ok(Self::new(BufReader::new(file)))
56    }
57}
58
59impl<R: BufRead> FastaReader<R> {
60    /// Create a new FASTA reader
61    pub fn new(reader: R) -> Self {
62        Self {
63            reader,
64            current_id: String::new(),
65            current_desc: None,
66            current_seq: Vec::new(),
67        }
68    }
69
70    /// Read the next record
71    #[allow(clippy::should_implement_trait)]
72    pub fn next(&mut self) -> AlignResult<Option<FastaRecord>> {
73        let mut line = String::new();
74
75        loop {
76            line.clear();
77            let bytes_read = self.reader.read_line(&mut line)?;
78            let trimmed = line.trim();
79
80            if bytes_read == 0 {
81                // EOF - return last record if any
82                return if self.current_id.is_empty() {
83                    Ok(None)
84                } else {
85                    Ok(Some(self.finish_record()?))
86                };
87            }
88
89            if trimmed.is_empty() {
90                continue;
91            }
92
93            if let Some(header) = trimmed.strip_prefix('>') {
94                // New header
95                if self.current_id.is_empty() {
96                    // First record - just save header
97                    if let Some((id, desc)) = header.split_once(char::is_whitespace) {
98                        self.current_id = id.to_string();
99                        self.current_desc = Some(desc.to_string());
100                    } else {
101                        self.current_id = header.to_string();
102                        self.current_desc = None;
103                    }
104                } else {
105                    // New record starting - return current record
106                    // Note: we've already consumed the new header line, so we need
107                    // to handle this differently - put it back or save for next call
108                    return Ok(Some(self.finish_record_with_header(trimmed)?));
109                }
110            } else {
111                // Sequence data
112                if self.current_id.is_empty() {
113                    return Err(AlignError::InvalidFormat(
114                        "Sequence data before header".into(),
115                    ));
116                }
117                self.current_seq.extend_from_slice(trimmed.as_bytes());
118            }
119        }
120    }
121
122    fn finish_record(&mut self) -> AlignResult<FastaRecord> {
123        let seq = parse_dna(&self.current_seq)?;
124        let id = std::mem::take(&mut self.current_id);
125        let desc = self.current_desc.take();
126        self.current_seq.clear();
127
128        Ok(FastaRecord { id, desc, seq })
129    }
130
131    fn finish_record_with_header(&mut self, next_header: &str) -> AlignResult<FastaRecord> {
132        let record = self.finish_record()?;
133
134        // Parse the next header for the subsequent call
135        let header = &next_header[1..];
136        if let Some((id, desc)) = header.split_once(char::is_whitespace) {
137            self.current_id = id.to_string();
138            self.current_desc = Some(desc.to_string());
139        } else {
140            self.current_id = header.to_string();
141            self.current_desc = None;
142        }
143
144        Ok(record)
145    }
146}
147
148/// Iterator over FASTA records
149impl<R: BufRead> Iterator for FastaReader<R> {
150    type Item = AlignResult<FastaRecord>;
151
152    fn next(&mut self) -> Option<Self::Item> {
153        self.next().transpose()
154    }
155}
156
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// A freshly built record exposes its ID and length and has no description.
    #[test]
    fn test_record_new() {
        let record = FastaRecord::new("seq1".to_string(), vec![Nuc::A, Nuc::C, Nuc::G, Nuc::T]);
        assert_eq!(record.id, "seq1");
        assert_eq!(record.len(), 4);
        assert!(!record.is_empty());
        assert!(record.desc.is_none());
    }

    /// A record without sequence data reports itself as empty.
    #[test]
    fn test_record_empty_sequence() {
        let record = FastaRecord::new("empty".to_string(), Vec::new());
        assert_eq!(record.len(), 0);
        assert!(record.is_empty());
    }

    /// Empty input yields no record and no error.
    #[test]
    fn test_reader_empty_input() {
        let mut reader = FastaReader::new(Cursor::new(&b""[..]));
        assert!(matches!(reader.next(), Ok(None)));
    }

    /// Blank and whitespace-only lines are skipped.
    #[test]
    fn test_reader_blank_lines_only() {
        let mut reader = FastaReader::new(Cursor::new(&b"\n   \n\t\n"[..]));
        assert!(matches!(reader.next(), Ok(None)));
    }

    /// Sequence data that precedes any header is a format error.
    #[test]
    fn test_reader_sequence_before_header() {
        let mut reader = FastaReader::new(Cursor::new(&b"ACGT\n"[..]));
        assert!(matches!(reader.next(), Err(AlignError::InvalidFormat(_))));
    }

    /// The `Iterator` impl ends immediately on empty input.
    #[test]
    fn test_iterator_empty_input() {
        let mut reader = FastaReader::new(Cursor::new(&b""[..]));
        assert!(Iterator::next(&mut reader).is_none());
    }
}