noodles_gff/io/
reader.rs

1//! GFF reader and iterators.
2
3pub(crate) mod line;
4mod line_bufs;
5mod lines;
6mod record_bufs;
7
8use std::io::{self, BufRead, Read, Seek};
9
10use noodles_bgzf as bgzf;
11use noodles_core::Region;
12use noodles_csi::{self as csi, BinningIndex};
13
14pub use self::{line_bufs::LineBufs, lines::Lines, record_bufs::RecordBufs};
15use crate::{Line, feature::RecordBuf};
16
/// A GFF reader.
///
/// This wraps an underlying reader `R`. Line- and record-oriented reading is available when `R`
/// implements [`BufRead`]; region queries are available when the reader wraps a BGZF reader over
/// a seekable source.
pub struct Reader<R> {
    // The wrapped input source that all reads are delegated to.
    inner: R,
}
21
22impl<R> Reader<R> {
23    /// Returns a reference to the underlying reader.
24    ///
25    /// # Examples
26    ///
27    /// ```
28    /// # use std::io;
29    /// use noodles_gff as gff;
30    /// let reader = gff::io::Reader::new(io::empty());
31    /// let _ = reader.get_ref();
32    /// ```
33    pub fn get_ref(&self) -> &R {
34        &self.inner
35    }
36
37    /// Returns a mutable reference to the underlying reader.
38    ///
39    /// # Examples
40    ///
41    /// ```
42    /// # use std::io;
43    /// use noodles_gff as gff;
44    /// let mut reader = gff::io::Reader::new(io::empty());
45    /// let _ = reader.get_mut();
46    /// ```
47    pub fn get_mut(&mut self) -> &mut R {
48        &mut self.inner
49    }
50
51    /// Unwraps and returns the underlying reader.
52    ///
53    /// # Examples
54    ///
55    /// ```
56    /// # use std::io;
57    /// use noodles_gff as gff;
58    /// let reader = gff::io::Reader::new(io::empty());
59    /// let _ = reader.into_inner();
60    /// ```
61    pub fn into_inner(self) -> R {
62        self.inner
63    }
64}
65
impl<R> Reader<R>
where
    R: BufRead,
{
    /// Creates a GFF reader.
    ///
    /// The given reader is stored as is; nothing is read from it until one of the reading methods
    /// is called.
    ///
    /// # Examples
    ///
    /// ```
    /// # use std::io;
    /// use noodles_gff as gff;
    /// let reader = gff::io::Reader::new(io::empty());
    /// ```
    pub fn new(inner: R) -> Self {
        Self { inner }
    }

    /// Returns an iterator over line buffers starting from the current stream position.
    ///
    /// When using this, the caller is responsible to stop reading at either EOF or when the
    /// `FASTA` directive is read, whichever comes first.
    ///
    /// Unlike [`Self::read_line`], each line is parsed as a [`crate::LineBuf`].
    ///
    /// # Examples
    ///
    /// ```
    /// use noodles_gff::{self as gff, LineBuf};
    ///
    /// let data = b"##gff-version 3
    /// sq0\tNOODLES\tgene\t8\t13\t.\t+\t.\tgene_id=ndls0;gene_name=gene0
    /// ";
    /// let mut reader = gff::io::Reader::new(&data[..]);
    /// let mut lines = reader.line_bufs();
    ///
    /// let line = lines.next().transpose()?;
    /// assert!(matches!(line, Some(LineBuf::Directive(_))));
    ///
    /// let line = lines.next().transpose()?;
    /// assert!(matches!(line, Some(LineBuf::Record(_))));
    ///
    /// assert!(lines.next().is_none());
    /// # Ok::<_, std::io::Error>(())
    /// ```
    pub fn line_bufs(&mut self) -> LineBufs<'_, R> {
        LineBufs::new(self)
    }

    /// Reads a single line without eagerly decoding it.
    ///
    /// The line is read from the underlying reader into `line`. As the example below shows, a
    /// return value of `0` signals that no more input is available; otherwise the value is
    /// presumably the number of bytes read (it is passed through from the internal line reader).
    ///
    /// # Examples
    ///
    /// ```
    /// use noodles_gff as gff;
    ///
    /// let data = b"##gff-version 3\n";
    /// let mut reader = gff::io::Reader::new(&data[..]);
    ///
    /// let mut line = gff::Line::default();
    ///
    /// reader.read_line(&mut line)?;
    /// assert_eq!(line.kind(), gff::line::Kind::Directive);
    ///
    /// assert_eq!(reader.read_line(&mut line)?, 0);
    /// # Ok::<_, std::io::Error>(())
    /// ```
    pub fn read_line(&mut self, line: &mut Line) -> io::Result<usize> {
        line::read_line(&mut self.inner, line)
    }

    /// Returns an iterator over lines starting from the current stream position.
    ///
    /// When using this, the caller is responsible to stop reading at either EOF or when the
    /// `FASTA` directive is read, whichever comes first.
    ///
    /// # Examples
    ///
    /// ```
    /// # use std::io;
    /// use noodles_gff::{self as gff, directive_buf::key};
    ///
    /// let mut reader = gff::io::Reader::new(io::empty());
    ///
    /// for result in reader.lines() {
    ///     let line = result?;
    ///
    ///     if let Some(key::FASTA) = line.as_directive().map(|directive| directive.key().as_ref()) {
    ///         break;
    ///     }
    ///
    ///     // ...
    /// }
    /// # Ok::<_, io::Error>(())
    /// ```
    pub fn lines(&mut self) -> Lines<'_, R> {
        Lines::new(self)
    }

    /// Returns an iterator over records starting from the current stream position.
    ///
    /// This filters lines for only records. It stops at either EOF or when the `FASTA` directive
    /// is read, whichever comes first.
    ///
    /// The iterator is layered on top of [`Self::line_bufs`].
    ///
    /// # Examples
    ///
    /// ```
    /// use noodles_gff as gff;
    ///
    /// let data = b"##gff-version 3
    /// sq0\tNOODLES\tgene\t8\t13\t.\t+\t.\tgene_id=ndls0;gene_name=gene0
    /// ";
    /// let mut reader = gff::io::Reader::new(&data[..]);
    /// let mut records = reader.record_bufs();
    ///
    /// assert!(records.next().transpose()?.is_some());
    /// assert!(records.next().is_none());
    /// # Ok::<_, std::io::Error>(())
    /// ```
    pub fn record_bufs(&mut self) -> RecordBufs<'_, R> {
        RecordBufs::new(self.line_bufs())
    }
}
188
189impl<R> Reader<bgzf::io::Reader<R>>
190where
191    R: Read + Seek,
192{
193    /// Returns an iterator over records that intersects the given region.
194    ///
195    /// # Examples
196    ///
197    /// ```no_run
198    /// # use std::fs::File;
199    /// use noodles_bgzf as bgzf;
200    /// use noodles_csi as csi;
201    /// use noodles_gff as gff;
202    ///
203    /// let mut reader = File::open("annotations.gff3.gz")
204    ///     .map(bgzf::io::Reader::new)
205    ///     .map(gff::io::Reader::new)?;
206    ///
207    /// let index = csi::fs::read("annotations.gff3.gz.csi")?;
208    /// let region = "sq0:8-13".parse()?;
209    /// let query = reader.query(&index, &region)?;
210    ///
211    /// for result in query {
212    ///     let record = result?;
213    ///     // ...
214    /// }
215    ///
216    /// # Ok::<(), Box<dyn std::error::Error>>(())
217    /// ```
218    pub fn query<'r, I>(
219        &'r mut self,
220        index: &I,
221        region: &'r Region,
222    ) -> io::Result<impl Iterator<Item = io::Result<RecordBuf>> + use<'r, I, R>>
223    where
224        I: BinningIndex,
225    {
226        let header = index
227            .header()
228            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "missing index header"))?;
229
230        let reference_sequence_id = header
231            .reference_sequence_names()
232            .get_index_of(region.name())
233            .ok_or_else(|| {
234                io::Error::new(
235                    io::ErrorKind::InvalidInput,
236                    "missing reference sequence name",
237                )
238            })?;
239
240        let chunks = index.query(reference_sequence_id, region.interval())?;
241
242        let records = csi::io::Query::new(&mut self.inner, chunks)
243            .indexed_records(header)
244            .filter_by_region(region)
245            .map(|result| {
246                result.and_then(|r| {
247                    let line = Line(r.as_ref().into());
248
249                    line.as_record()
250                        .ok_or_else(|| {
251                            io::Error::new(io::ErrorKind::InvalidData, "line is not a record")
252                        })?
253                        .and_then(|record| RecordBuf::try_from_feature_record(&record))
254                })
255            });
256
257        Ok(records)
258    }
259}
260
/// Reads one line from `reader`, appending its bytes to `buf` without the line terminator.
///
/// A trailing `\n` is removed, along with a `\r` that immediately precedes it. Input that ends
/// without a final `\n` is left exactly as read.
///
/// Returns the number of bytes consumed from `reader`, terminator included. `0` means the reader
/// was already at EOF, in which case `buf` is not modified.
fn read_line<R>(reader: &mut R, buf: &mut Vec<u8>) -> io::Result<usize>
where
    R: BufRead,
{
    let bytes_read = reader.read_until(b'\n', buf)?;

    // Nothing was appended, so whatever `buf` already holds must stay untouched.
    if bytes_read == 0 {
        return Ok(0);
    }

    if buf.last() == Some(&b'\n') {
        buf.pop();

        // Treat CRLF as a single terminator.
        if buf.last() == Some(&b'\r') {
            buf.pop();
        }
    }

    Ok(bytes_read)
}
283
#[cfg(test)]
mod tests {
    use super::*;

    /// Reads every record from `src` via `Reader::record_bufs` and returns how many there were.
    fn count_records(src: &[u8]) -> io::Result<usize> {
        let mut reader = Reader::new(src);
        let records = reader.record_bufs().collect::<io::Result<Vec<_>>>()?;
        Ok(records.len())
    }

    #[test]
    fn test_records() -> io::Result<()> {
        let data = b"\
##gff-version 3
sq0\tNOODLES\tgene\t8\t13\t.\t+\t.\tgene_id=ndls0;gene_name=gene0
";

        assert_eq!(count_records(&data[..])?, 1);

        Ok(())
    }

    #[test]
    fn test_records_with_fasta_directive() -> io::Result<()> {
        // Everything from the `FASTA` directive onward must not be yielded as records.
        let data = b"\
##gff-version 3
sq0\tNOODLES\tgene\t8\t13\t.\t+\t.\tgene_id=ndls0;gene_name=gene0
##FASTA
>sq0
ACGT
";

        assert_eq!(count_records(&data[..])?, 1);

        Ok(())
    }

    #[test]
    fn test_read_line() -> io::Result<()> {
        // (input, expected buffer contents after one call)
        let cases: [(&[u8], &[u8]); 3] = [
            (b"noodles\n", b"noodles"),
            (b"noodles\r\n", b"noodles"),
            (b"noodles", b"noodles"),
        ];

        let mut buf = Vec::new();

        for (mut src, expected) in cases {
            buf.clear();
            read_line(&mut src, &mut buf)?;
            assert_eq!(buf, expected);
        }

        Ok(())
    }
}