noodles_fasta/io/
indexer.rs

1//! FASTA indexer.
2
3use std::{
4    error::Error,
5    fmt,
6    io::{self, BufRead},
7};
8
9use memchr::memchr;
10
11use super::reader::{DEFINITION_PREFIX, read_line};
12use crate::{
13    fai::Record,
14    record::definition::{Definition, ParseError},
15};
16
/// A FASTA indexer.
///
/// This wraps a reader and keeps a running count of the bytes consumed from it, which is used
/// to report the byte offset of each record's sequence.
pub struct Indexer<R> {
    /// The underlying stream the FASTA data is read from.
    inner: R,
    /// The number of bytes consumed from `inner` so far. It starts at 0 and is advanced as
    /// definition and sequence lines are read.
    offset: u64,
}
22
23impl<R> Indexer<R>
24where
25    R: BufRead,
26{
27    /// Creates a FASTA indexer.
28    ///
29    /// # Examples
30    ///
31    /// ```
32    /// # use std::io;
33    /// use noodles_fasta as fasta;
34    /// let mut indexer = fasta::io::Indexer::new(io::empty());
35    /// ```
36    pub fn new(inner: R) -> Self {
37        Self { inner, offset: 0 }
38    }
39
40    /// Consumes a single sequence line.
41    ///
42    /// If successful, this returns the number of bytes read from the stream (i.e., the line width)
43    /// and the number of bases in the line. If the number of bytes read is 0, the entire sequence
44    /// of the current record was read.
45    fn consume_sequence_line(&mut self) -> io::Result<(usize, usize)> {
46        consume_sequence_line(&mut self.inner)
47    }
48
49    /// Indexes a raw FASTA record.
50    ///
51    /// The position of the stream is expected to be at the start or at the start of another
52    /// definition.
53    ///
54    /// # Errors
55    ///
56    /// An error is returned if the record fails to be completely read. This includes when
57    ///
58    ///   * the stream is not at the start of a definition;
59    ///   * the record is missing a sequence;
60    ///   * the sequence lines have a different number of bases, excluding the last line;
61    ///   * or the sequence lines are not the same length, excluding the last line.
62    ///
63    /// # Examples
64    ///
65    /// ```
66    /// # use std::io;
67    /// use noodles_fasta::{self as fasta, fai};
68    ///
69    /// let src = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
70    /// let mut indexer = fasta::io::Indexer::new(&src[..]);
71    ///
72    /// let mut records = Vec::new();
73    ///
74    /// while let Some(record) = indexer.index_record()? {
75    ///     records.push(record);
76    /// }
77    ///
78    /// let expected = [
79    ///     fai::Record::new("sq0", 4, 5, 4, 5),
80    ///     fai::Record::new("sq1", 10, 15, 4, 5),
81    /// ];
82    ///
83    /// assert_eq!(records, expected);
84    /// # Ok::<_, io::Error>(())
85    /// ```
86    pub fn index_record(&mut self) -> Result<Option<Record>, IndexError> {
87        let definition = match self.read_definition() {
88            Ok(None) => return Ok(None),
89            Ok(Some(d)) => d,
90            Err(e) => return Err(e.into()),
91        };
92
93        let offset = self.offset;
94        let mut length = 0;
95
96        let (line_width, line_bases) = self.consume_sequence_line()?;
97        let (mut prev_line_width, mut prev_line_bases) = (line_width, line_bases);
98
99        loop {
100            self.offset += prev_line_width as u64;
101            length += prev_line_bases;
102
103            match self.consume_sequence_line() {
104                Ok((0, _)) => break,
105                Ok((bytes_read, base_count)) => {
106                    if line_bases != prev_line_bases {
107                        return Err(IndexError::InvalidLineBases(line_bases, prev_line_bases));
108                    } else if line_width != prev_line_width {
109                        return Err(IndexError::InvalidLineWidth(line_width, prev_line_width));
110                    }
111
112                    prev_line_width = bytes_read;
113                    prev_line_bases = base_count;
114                }
115                Err(e) => return Err(IndexError::IoError(e)),
116            }
117        }
118
119        if length == 0 {
120            return Err(IndexError::EmptySequence(self.offset));
121        }
122
123        let record = Record::new(
124            definition.name(),
125            length as u64,
126            offset,
127            line_bases as u64,
128            line_width as u64,
129        );
130
131        Ok(Some(record))
132    }
133
134    fn read_definition(&mut self) -> io::Result<Option<Definition>> {
135        let mut buf = String::new();
136
137        match read_line(&mut self.inner, &mut buf) {
138            Ok(0) => return Ok(None),
139            Ok(n) => self.offset += n as u64,
140            Err(e) => return Err(e),
141        }
142
143        buf.parse()
144            .map(Some)
145            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
146    }
147}
148
149fn consume_sequence_line<R>(reader: &mut R) -> io::Result<(usize, usize)>
150where
151    R: BufRead,
152{
153    const LINE_FEED: u8 = b'\n';
154    const CARRIAGE_RETURN: u8 = b'\r';
155
156    fn count_bases(buf: &[u8]) -> usize {
157        if buf.ends_with(&[CARRIAGE_RETURN]) {
158            buf.len() - 1
159        } else {
160            buf.len()
161        }
162    }
163
164    let mut bytes_read = 0;
165    let mut base_count = 0;
166    let mut is_eol = false;
167
168    loop {
169        let src = reader.fill_buf()?;
170
171        if is_eol || src.is_empty() || src[0] == DEFINITION_PREFIX {
172            break;
173        }
174
175        let (chunk_len, chunk_base_count) = match memchr(LINE_FEED, src) {
176            Some(i) => {
177                is_eol = true;
178                (i + 1, count_bases(&src[..i]))
179            }
180            None => (src.len(), count_bases(src)),
181        };
182
183        reader.consume(chunk_len);
184
185        bytes_read += chunk_len;
186        base_count += chunk_base_count;
187    }
188
189    Ok((bytes_read, base_count))
190}
191
/// An error returned when a FASTA record fails to be indexed.
#[derive(Debug)]
pub enum IndexError {
    /// The record has no sequence bases.
    ///
    /// The value is the indexer's byte offset at the point the empty sequence was detected.
    EmptySequence(u64),
    /// The definition is invalid.
    ///
    /// The value is the underlying definition parse error.
    InvalidDefinition(ParseError),
    /// A sequence line other than the last one has a different number of bases than the first
    /// line.
    ///
    /// The values are the expected and actual number of bases, in that order.
    InvalidLineBases(usize, usize),
    /// A sequence line other than the last one has a different width (in bytes, including the
    /// line terminator) than the first line.
    ///
    /// The values are the expected and actual line widths, in that order.
    InvalidLineWidth(usize, usize),
    /// An I/O error occurred while reading.
    IoError(io::Error),
}
200
201impl Error for IndexError {
202    fn source(&self) -> Option<&(dyn Error + 'static)> {
203        match self {
204            Self::EmptySequence(_) => None,
205            Self::InvalidDefinition(e) => Some(e),
206            Self::InvalidLineBases(..) => None,
207            Self::InvalidLineWidth(..) => None,
208            Self::IoError(e) => Some(e),
209        }
210    }
211}
212
213impl fmt::Display for IndexError {
214    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
215        match self {
216            Self::EmptySequence(offset) => write!(f, "empty sequence at offset {offset}"),
217            Self::InvalidDefinition(e) => e.fmt(f),
218            Self::InvalidLineBases(expected, actual) => {
219                write!(f, "invalid line bases: expected {expected}, got {actual}")
220            }
221            Self::InvalidLineWidth(expected, actual) => {
222                write!(f, "invalid line width: expected {expected}, got {actual}")
223            }
224            Self::IoError(e) => e.fmt(f),
225        }
226    }
227}
228
229impl From<io::Error> for IndexError {
230    fn from(error: io::Error) -> Self {
231        Self::IoError(error)
232    }
233}
234
235impl From<ParseError> for IndexError {
236    fn from(error: ParseError) -> Self {
237        Self::InvalidDefinition(error)
238    }
239}
240
241impl From<IndexError> for io::Error {
242    fn from(error: IndexError) -> Self {
243        match error {
244            IndexError::IoError(e) => e,
245            _ => Self::new(io::ErrorKind::InvalidInput, error),
246        }
247    }
248}
249
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_index_record_with_invalid_line_bases() {
        // The second line has 3 bases instead of 4 and is not the last line.
        let src = b">sq0\nACGT\nACG\nACGT\nAC\n";
        let result = Indexer::new(&src[..]).index_record();
        assert!(matches!(result, Err(IndexError::InvalidLineBases(4, 3))));
    }

    #[test]
    fn test_index_record_with_invalid_line_width() {
        // The second line has the same number of bases but ends with CRLF instead of LF.
        let src = b">sq0\nACGT\nACGT\r\nACGT\nAC\n";
        let result = Indexer::new(&src[..]).index_record();
        assert!(matches!(result, Err(IndexError::InvalidLineWidth(5, 6))));
    }

    #[test]
    fn test_index_record_with_empty_sequence() {
        // A definition line with no sequence lines after it.
        let src = b">sq0\n";
        let result = Indexer::new(&src[..]).index_record();
        assert!(matches!(result, Err(IndexError::EmptySequence(5))));
    }

    #[test]
    fn test_consume_sequence_line() -> io::Result<()> {
        use std::io::BufReader;

        // LF line endings.
        let mut lf_reader = &b"ACGT\nNNNN\n"[..];
        assert_eq!(consume_sequence_line(&mut lf_reader)?, (5, 4));

        // CRLF line endings.
        let mut crlf_reader = &b"ACGT\r\nNNNN\r\n"[..];
        assert_eq!(consume_sequence_line(&mut crlf_reader)?, (6, 4));

        // CRLF line endings with a buffer that is shorter than a line.
        let mut small_buf_reader = BufReader::with_capacity(3, &b"ACGT\r\nNNNN\r\n"[..]);
        assert_eq!(consume_sequence_line(&mut small_buf_reader)?, (6, 4));

        Ok(())
    }
}