angsd_saf/
reader.rs

1//! Reading of the SAF format.
2
3use std::io;
4
5use crate::ReadStatus;
6
7use super::{
8    index::Index,
9    record::{Id, Record},
10    version::{Version, V3, V4},
11};
12
13mod builder;
14pub use builder::Builder;
15
16mod intersect;
17pub use intersect::Intersect;
18
19mod traits;
20pub(crate) use traits::ReaderExt;
21
22/// A SAF reader for the [`V3`] format.
23pub type ReaderV3<R> = Reader<R, V3>;
24
25/// A SAF reader for the [`V4`] format.
26pub type ReaderV4<R> = Reader<R, V4>;
27
28/// A SAF reader.
29///
30/// The reader is generic over the inner reader type and over the SAF [`Version`] being read.
31/// Version-specific aliases [`ReaderV3`] and [`ReaderV4`] are provided for convenience.
32pub struct Reader<R, V> {
33    location: Location<V>,
34    position_reader: bgzf::Reader<R>,
35    item_reader: bgzf::Reader<R>,
36}
37
38impl<R, V> Reader<R, V>
39where
40    R: io::BufRead,
41    V: Version,
42{
43    /// Returns a new record suitable for use in reading.
44    pub fn create_record_buf(&self) -> Record<Id, V::Item> {
45        V::create_record_buf(self.index())
46    }
47
48    /// Creates a new reader from its raw parts.
49    ///
50    /// A [`Builder`] will typically be a more ergonimic way to create a reader.
51    ///
52    /// Returns [`None`] if index contains no records.
53    pub fn from_bgzf(
54        index: Index<V>,
55        position_reader: bgzf::Reader<R>,
56        item_reader: bgzf::Reader<R>,
57    ) -> Option<Self> {
58        Location::setup(index).map(|location| Self {
59            location,
60            position_reader,
61            item_reader,
62        })
63    }
64
65    /// Returns the index.
66    pub fn index(&self) -> &Index<V> {
67        &self.location.index
68    }
69
70    /// Returns a mutable reference to the index.
71    pub fn index_mut(&mut self) -> &mut Index<V> {
72        &mut self.location.index
73    }
74
75    /// Returns the inner index, position reader, and item reader, consuming `self`.
76    pub fn into_parts(self) -> (Index<V>, bgzf::Reader<R>, bgzf::Reader<R>) {
77        (self.location.index, self.position_reader, self.item_reader)
78    }
79
80    /// Returns the inner item reader.
81    pub fn item_reader(&self) -> &bgzf::Reader<R> {
82        &self.item_reader
83    }
84
85    /// Returns a mutable reference to the inner item reader.
86    pub fn item_reader_mut(&mut self) -> &mut bgzf::Reader<R> {
87        &mut self.item_reader
88    }
89
90    /// Returns the inner position reader.
91    pub fn position_reader(&self) -> &bgzf::Reader<R> {
92        &self.position_reader
93    }
94
95    /// Returns a mutable reference to the inner position reader.
96    pub fn position_reader_mut(&mut self) -> &mut bgzf::Reader<R> {
97        &mut self.position_reader
98    }
99
100    /// Reads a single item from the item reader into the provided buffer.
101    ///
102    /// Note that this will bring the item and position readers out of sync. Use
103    /// [`Self::read_record`] instead unless you wish to manually re-sync the underlying readers.
104    pub fn read_item(&mut self, buf: &mut V::Item) -> io::Result<ReadStatus> {
105        V::read_item(&mut self.item_reader, buf)
106    }
107
108    /// Reads and checks the magic numbers.
109    ///
110    /// Assumes the streams are positioned at the beginning of the files.
111    pub fn read_magic(&mut self) -> io::Result<()> {
112        V::read_magic(&mut self.position_reader).and_then(|_| V::read_magic(&mut self.item_reader))
113    }
114
115    /// Reads a single position from the position reader.
116    ///
117    /// Note that this will bring the item and position readers out of sync. Use
118    /// [`Self::read_record`] instead unless you wish to manually re-sync the underlying readers.
119    pub fn read_position(&mut self) -> io::Result<Option<u32>> {
120        self.position_reader.read_position()
121    }
122
123    /// Reads a single record.
124    ///
125    /// Note that the record buffer needs to be correctly set up. Use [`Self::create_record_buf`]
126    /// for a correctly initialised record buffer to use for reading.
127    pub fn read_record(&mut self, record: &mut Record<Id, V::Item>) -> io::Result<ReadStatus> {
128        if !self.location.contig_is_finished() || self.location.next_contig().is_some() {
129            // Index still contains data, read and check that readers are not at EoF
130            match (self.read_position()?, self.read_item(record.item_mut())?) {
131                (Some(pos), ReadStatus::NotDone) => {
132                    *record.contig_id_mut() = self.location.contig_id;
133                    *record.position_mut() = pos;
134
135                    self.location.next_site_on_contig();
136
137                    Ok(ReadStatus::NotDone)
138                }
139                (Some(_), ReadStatus::Done) => Err(eof_err(
140                    "reached EoF in SAF position file before reaching EoF in SAF item file",
141                )),
142                (None, ReadStatus::NotDone) => Err(eof_err(
143                    "reached EoF in SAF item file before reaching EoF in SAF position file",
144                )),
145                (None, ReadStatus::Done) => Err(eof_err(
146                    "reached EoF in both SAF files before reaching end of index",
147                )),
148            }
149        } else {
150            // Reached end of index, check that readers are at EoF
151            let position_reader_is_done = ReadStatus::check(&mut self.position_reader)?.is_done();
152            let item_reader_is_done = ReadStatus::check(&mut self.item_reader)?.is_done();
153
154            match (position_reader_is_done, item_reader_is_done) {
155                (true, true) => Ok(ReadStatus::Done),
156                (true, false) => Err(data_err(
157                    "reached end of index before reaching EoF in SAF position file",
158                )),
159                (false, true) => Err(data_err(
160                    "reached end of index before reaching EoF in SAF item file",
161                )),
162                (false, false) => Err(data_err(
163                    "reached end of index before reaching EoF in both SAF files",
164                )),
165            }
166        }
167    }
168}
169
170impl<R, V> Reader<R, V>
171where
172    R: io::BufRead + io::Seek,
173    V: Version,
174{
175    /// Creates an intersection of two readers.
176    ///
177    /// The resulting intersecting readers will read only records that lie on the same contigs
178    /// and the same positions. Further readers can be added to the resulting intersecting reader
179    /// by chaining the [`Intersect::intersect`] method.
180    pub fn intersect(self, other: Self) -> Intersect<R, V> {
181        Intersect::from_reader(self).intersect(other)
182    }
183
184    /// Seeks to start of contig.
185    ///
186    /// The `contig_id` refers to the position of records in the index.
187    ///
188    /// # Panics
189    ///
190    /// Panics if `contig_id` is larger than the number of records defined in the index.
191    pub fn seek(&mut self, contig_id: usize) -> io::Result<()> {
192        self.location
193            .set_contig(contig_id)
194            .expect("cannot seek to contig ID");
195
196        let record = &self.index().records()[contig_id];
197        let position_offset = record.position_offset();
198        let item_offset = record.item_offset();
199
200        let position_vpos = bgzf::VirtualPosition::from(position_offset);
201        self.position_reader.seek(position_vpos)?;
202
203        let item_vpos = bgzf::VirtualPosition::from(item_offset);
204        self.item_reader.seek(item_vpos)?;
205
206        Ok(())
207    }
208
209    /// Seeks to start of contig by name.
210    ///
211    /// Note that this requires a linear search of names in the index with worst time complexity
212    /// linear in the index size.. If the index is large, and the contig ID is known, prefer
213    /// [`Self::seek`] is more efficient.
214    ///
215    /// # Panics
216    ///
217    /// Panics if sequence name is not defined in index.
218    pub fn seek_by_name(&mut self, name: &str) -> io::Result<()> {
219        let contig_id = self
220            .index()
221            .records()
222            .iter()
223            .position(|x| x.name() == name)
224            .expect("name not found in index");
225
226        self.seek(contig_id)
227    }
228}
229
230/// A SAF reader location.
231///
232/// The location tracks the current location of the reader relative to its index file in terms
233/// of which contig is currently being read, and how many sites are left on that contig.
234#[derive(Clone, Debug, Eq, PartialEq)]
235struct Location<V> {
236    pub index: Index<V>,
237    pub contig_id: usize,
238    pub sites_left_on_contig: usize,
239}
240
241impl<V> Location<V>
242where
243    V: Version,
244{
245    /// Returns `true` if no more sites are left to read on the current contig.
246    pub fn contig_is_finished(&self) -> bool {
247        0 == self.sites_left_on_contig
248    }
249
250    /// Decrements the number of sites left to read on current contig.
251    pub fn next_site_on_contig(&mut self) {
252        self.sites_left_on_contig -= 1
253    }
254
255    /// Moves the location first site on the next contig in index.
256    ///
257    /// Returns `None` is no more contigs exist in the index.
258    pub fn next_contig(&mut self) -> Option<()> {
259        self.set_contig(self.contig_id + 1)
260    }
261
262    /// Moves the location to the first site on the contig with the provided ID in the index.
263    ///
264    /// Returns `None` if contig with the provided ID does not exist in the index.
265    pub fn set_contig(&mut self, contig_id: usize) -> Option<()> {
266        self.contig_id = contig_id;
267        self.sites_left_on_contig = self.index.records().get(self.contig_id)?.sites();
268        Some(())
269    }
270
271    /// Creates a new location from an index.
272    ///
273    /// The location will be set to the first site on the first contig. Returns `None` if no contigs
274    /// are defined in the index.
275    pub fn setup(index: Index<V>) -> Option<Self> {
276        let contig_id = 0;
277        let sites_left_on_contig = index.records().first()?.sites();
278
279        Some(Self {
280            index,
281            contig_id,
282            sites_left_on_contig,
283        })
284    }
285}
286
/// Creates an I/O error of kind [`io::ErrorKind::UnexpectedEof`] with the provided message.
fn eof_err(msg: &str) -> io::Error {
    let kind = io::ErrorKind::UnexpectedEof;
    io::Error::new(kind, msg)
}
290
/// Creates an I/O error of kind [`io::ErrorKind::InvalidData`] with the provided message.
fn data_err(msg: &str) -> io::Error {
    let kind = io::ErrorKind::InvalidData;
    io::Error::new(kind, msg)
}