Skip to main content

genomicframe_core/formats/fasta/
reader.rs

//! FASTA format support (sequence data).
//!
//! FASTA is a text-based format for representing nucleotide or protein sequences.
4
5use crate::core::{GenomicReader, GenomicRecordIterator};
6use crate::error::Result;
7use std::fs::File;
8use std::io::{BufRead, BufReader};
9use std::path::Path;
10
/// A single FASTA record (one sequence entry).
///
/// Records compare equal when their `id`, `description` and `sequence` are
/// all equal, and can be used as keys in hashed collections.
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)]
pub struct FastaRecord {
    /// Sequence identifier: the first token of the header line, without the leading `>`.
    pub id: String,
    /// Optional description: the rest of the header line after the identifier.
    pub description: Option<String>,
    /// Sequence data, with all sequence lines of the record concatenated.
    pub sequence: String,
}
21
22impl FastaRecord {
23    /// Get the full header line
24    pub fn header(&self) -> String {
25        match &self.description {
26            Some(desc) => format!("{} {}", self.id, desc),
27            None => self.id.clone(),
28        }
29    }
30
31    /// Get sequence length
32    pub fn len(&self) -> usize {
33        self.sequence.len()
34    }
35
36    /// Check if sequence is empty
37    pub fn is_empty(&self) -> bool {
38        self.sequence.is_empty()
39    }
40}
41
/// FASTA file reader.
///
/// Wraps an input source and keeps a single line of lookahead so that a
/// record can stop at the next header line without losing it.
///
/// The `BufRead` requirement lives on the `impl` blocks rather than on the
/// type itself, so the struct can be named in other signatures without
/// repeating the bound.
#[derive(Debug)]
pub struct FastaReader<R> {
    /// Underlying input source.
    reader: R,
    /// Line that has been read from `reader` but not yet consumed, if any.
    current_line: Option<String>,
}
47
48impl FastaReader<BufReader<File>> {
49    /// Open a FASTA file from a path
50    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
51        let file = File::open(path)?;
52        let reader = BufReader::new(file);
53        Ok(Self::new(reader))
54    }
55}
56
57impl<R: BufRead> FastaReader<R> {
58    /// Create a new FASTA reader from a buffered reader
59    pub fn new(reader: R) -> Self {
60        Self {
61            reader,
62            current_line: None,
63        }
64    }
65
66    /// Peek at the next line without consuming it
67    fn peek_line(&mut self) -> Result<Option<&str>> {
68        if self.current_line.is_none() {
69            let mut line = String::new();
70            let bytes_read = self.reader.read_line(&mut line)?;
71            if bytes_read == 0 {
72                return Ok(None);
73            }
74            self.current_line = Some(line);
75        }
76        Ok(self.current_line.as_deref())
77    }
78
79    /// Consume the peeked line
80    fn consume_line(&mut self) -> Option<String> {
81        self.current_line.take()
82    }
83}
84
85impl<R: BufRead> GenomicRecordIterator for FastaReader<R> {
86    type Record = FastaRecord;
87    
88    fn next_raw(&mut self) -> Result<Option<Vec<u8>>> { 
89        // TODO: Implement
90        Ok(None)
91    }
92
93    fn next_record(&mut self) -> Result<Option<Self::Record>> {
94        // Find the next header line
95        loop {
96            let line = match self.peek_line()? {
97                Some(line) => line.to_string(),
98                None => return Ok(None),
99            };
100
101            if line.starts_with('>') {
102                self.consume_line();
103
104                // Parse header
105                let header = line.trim_start_matches('>').trim();
106                let (id, description) = match header.split_once(' ') {
107                    Some((id, desc)) => (id.to_string(), Some(desc.to_string())),
108                    None => (header.to_string(), None),
109                };
110
111                // Read sequence lines until next header or EOF
112                let mut sequence = String::new();
113                loop {
114                    match self.peek_line()? {
115                        Some(line) if !line.starts_with('>') => {
116                            let line = self.consume_line().unwrap();
117                            sequence.push_str(line.trim());
118                        }
119                        _ => break,
120                    }
121                }
122
123                return Ok(Some(FastaRecord {
124                    id,
125                    description,
126                    sequence,
127                }));
128            } else {
129                self.consume_line();
130            }
131        }
132    }
133}
134
135impl<R: BufRead> GenomicReader for FastaReader<R> {
136    type Metadata = ();
137
138    fn metadata(&self) -> &Self::Metadata {
139        &() // No metadata for FASTA
140    }
141}
142
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Two records are parsed in order: the first has a description and a
    /// sequence wrapped over two lines, the second has neither. After that
    /// the reader reports end of input.
    #[test]
    fn test_fasta_parsing() {
        let input = concat!(
            ">seq1 description here\n",
            "ACGTACGTACGT\n",
            "ACGTACGT\n",
            ">seq2\n",
            "GGGGCCCC\n",
        );
        let mut reader = FastaReader::new(Cursor::new(input));

        // (id, description, sequence) expected for each record, in file order.
        let expected = [
            ("seq1", Some("description here"), "ACGTACGTACGTACGTACGT"),
            ("seq2", None, "GGGGCCCC"),
        ];

        for &(id, description, sequence) in expected.iter() {
            let record = reader.next_record().unwrap().unwrap();
            assert_eq!(record.id, id);
            assert_eq!(record.description, description.map(str::to_string));
            assert_eq!(record.sequence, sequence);
        }

        assert!(reader.next_record().unwrap().is_none());
    }
}