noodles_fasta/io/
indexer.rs1use std::{
4 error::Error,
5 fmt,
6 io::{self, BufRead},
7};
8
9use memchr::memchr;
10
11use super::reader::{DEFINITION_PREFIX, read_line};
12use crate::{
13 fai::Record,
14 record::definition::{Definition, ParseError},
15};
16
/// A FASTA indexer.
///
/// Wraps a buffered reader and tracks how many bytes have been consumed from
/// it, so that each indexed record can report the byte offset at which its
/// sequence begins.
#[derive(Debug)]
pub struct Indexer<R> {
    /// The underlying reader the FASTA data is pulled from.
    inner: R,
    /// Number of bytes consumed from `inner` so far.
    offset: u64,
}
22
23impl<R> Indexer<R>
24where
25 R: BufRead,
26{
27 pub fn new(inner: R) -> Self {
37 Self { inner, offset: 0 }
38 }
39
40 fn consume_sequence_line(&mut self) -> io::Result<(usize, usize)> {
46 consume_sequence_line(&mut self.inner)
47 }
48
49 pub fn index_record(&mut self) -> Result<Option<Record>, IndexError> {
87 let definition = match self.read_definition() {
88 Ok(None) => return Ok(None),
89 Ok(Some(d)) => d,
90 Err(e) => return Err(e.into()),
91 };
92
93 let offset = self.offset;
94 let mut length = 0;
95
96 let (line_width, line_bases) = self.consume_sequence_line()?;
97 let (mut prev_line_width, mut prev_line_bases) = (line_width, line_bases);
98
99 loop {
100 self.offset += prev_line_width as u64;
101 length += prev_line_bases;
102
103 match self.consume_sequence_line() {
104 Ok((0, _)) => break,
105 Ok((bytes_read, base_count)) => {
106 if line_bases != prev_line_bases {
107 return Err(IndexError::InvalidLineBases(line_bases, prev_line_bases));
108 } else if line_width != prev_line_width {
109 return Err(IndexError::InvalidLineWidth(line_width, prev_line_width));
110 }
111
112 prev_line_width = bytes_read;
113 prev_line_bases = base_count;
114 }
115 Err(e) => return Err(IndexError::IoError(e)),
116 }
117 }
118
119 if length == 0 {
120 return Err(IndexError::EmptySequence(self.offset));
121 }
122
123 let record = Record::new(
124 definition.name(),
125 length as u64,
126 offset,
127 line_bases as u64,
128 line_width as u64,
129 );
130
131 Ok(Some(record))
132 }
133
134 fn read_definition(&mut self) -> io::Result<Option<Definition>> {
135 let mut buf = String::new();
136
137 match read_line(&mut self.inner, &mut buf) {
138 Ok(0) => return Ok(None),
139 Ok(n) => self.offset += n as u64,
140 Err(e) => return Err(e),
141 }
142
143 buf.parse()
144 .map(Some)
145 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
146 }
147}
148
149fn consume_sequence_line<R>(reader: &mut R) -> io::Result<(usize, usize)>
150where
151 R: BufRead,
152{
153 const LINE_FEED: u8 = b'\n';
154 const CARRIAGE_RETURN: u8 = b'\r';
155
156 fn count_bases(buf: &[u8]) -> usize {
157 if buf.ends_with(&[CARRIAGE_RETURN]) {
158 buf.len() - 1
159 } else {
160 buf.len()
161 }
162 }
163
164 let mut bytes_read = 0;
165 let mut base_count = 0;
166 let mut is_eol = false;
167
168 loop {
169 let src = reader.fill_buf()?;
170
171 if is_eol || src.is_empty() || src[0] == DEFINITION_PREFIX {
172 break;
173 }
174
175 let (chunk_len, chunk_base_count) = match memchr(LINE_FEED, src) {
176 Some(i) => {
177 is_eol = true;
178 (i + 1, count_bases(&src[..i]))
179 }
180 None => (src.len(), count_bases(src)),
181 };
182
183 reader.consume(chunk_len);
184
185 bytes_read += chunk_len;
186 base_count += chunk_base_count;
187 }
188
189 Ok((bytes_read, base_count))
190}
191
192#[derive(Debug)]
193pub enum IndexError {
194 EmptySequence(u64),
195 InvalidDefinition(ParseError),
196 InvalidLineBases(usize, usize),
197 InvalidLineWidth(usize, usize),
198 IoError(io::Error),
199}
200
201impl Error for IndexError {
202 fn source(&self) -> Option<&(dyn Error + 'static)> {
203 match self {
204 Self::EmptySequence(_) => None,
205 Self::InvalidDefinition(e) => Some(e),
206 Self::InvalidLineBases(..) => None,
207 Self::InvalidLineWidth(..) => None,
208 Self::IoError(e) => Some(e),
209 }
210 }
211}
212
213impl fmt::Display for IndexError {
214 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
215 match self {
216 Self::EmptySequence(offset) => write!(f, "empty sequence at offset {offset}"),
217 Self::InvalidDefinition(e) => e.fmt(f),
218 Self::InvalidLineBases(expected, actual) => {
219 write!(f, "invalid line bases: expected {expected}, got {actual}")
220 }
221 Self::InvalidLineWidth(expected, actual) => {
222 write!(f, "invalid line width: expected {expected}, got {actual}")
223 }
224 Self::IoError(e) => e.fmt(f),
225 }
226 }
227}
228
229impl From<io::Error> for IndexError {
230 fn from(error: io::Error) -> Self {
231 Self::IoError(error)
232 }
233}
234
235impl From<ParseError> for IndexError {
236 fn from(error: ParseError) -> Self {
237 Self::InvalidDefinition(error)
238 }
239}
240
241impl From<IndexError> for io::Error {
242 fn from(error: IndexError) -> Self {
243 match error {
244 IndexError::IoError(e) => e,
245 _ => Self::new(io::ErrorKind::InvalidInput, error),
246 }
247 }
248}
249
#[cfg(test)]
mod tests {
    use super::*;

    // A non-final line with fewer bases than the first line is rejected.
    #[test]
    fn test_index_record_with_invalid_line_bases() {
        let src = b">sq0\nACGT\nACG\nACGT\nAC\n";
        let result = Indexer::new(&src[..]).index_record();

        assert!(matches!(result, Err(IndexError::InvalidLineBases(4, 3))));
    }

    // A non-final line with the same bases but a different terminator width
    // is rejected.
    #[test]
    fn test_index_record_with_invalid_line_width() {
        let src = b">sq0\nACGT\nACGT\r\nACGT\nAC\n";
        let result = Indexer::new(&src[..]).index_record();

        assert!(matches!(result, Err(IndexError::InvalidLineWidth(5, 6))));
    }

    // A definition with no sequence lines is rejected, reporting the offset
    // just past the definition line.
    #[test]
    fn test_index_record_with_empty_sequence() {
        let src = b">sq0\n";
        let result = Indexer::new(&src[..]).index_record();

        assert!(matches!(result, Err(IndexError::EmptySequence(5))));
    }

    #[test]
    fn test_consume_sequence_line() -> io::Result<()> {
        // LF-terminated line.
        let mut lf = &b"ACGT\nNNNN\n"[..];
        assert_eq!(consume_sequence_line(&mut lf)?, (5, 4));

        // CRLF-terminated line: the carriage return is not counted as a base.
        let mut crlf = &b"ACGT\r\nNNNN\r\n"[..];
        assert_eq!(consume_sequence_line(&mut crlf)?, (6, 4));

        // Same input, but delivered in 3-byte chunks so the line spans
        // several buffer refills.
        let mut chunked = io::BufReader::with_capacity(3, &b"ACGT\r\nNNNN\r\n"[..]);
        assert_eq!(consume_sequence_line(&mut chunked)?, (6, 4));

        Ok(())
    }
}