angsd-saf 0.1.0

Reading and writing of ANGSD SAF file format
Documentation
//! Reading of the SAF format.

use std::io;

use crate::ReadStatus;

use super::{
    index::Index,
    record::{Id, Record},
    version::{Version, V3, V4},
};

mod builder;
pub use builder::Builder;

mod intersect;
pub use intersect::Intersect;

mod traits;
pub(crate) use traits::ReaderExt;

/// A SAF reader for the [`V3`] format.
pub type ReaderV3<R> = Reader<R, V3>;

/// A SAF reader for the [`V4`] format.
pub type ReaderV4<R> = Reader<R, V4>;

/// A SAF reader.
///
/// The reader is generic over the inner reader type and over the SAF [`Version`] being read.
/// Version-specific aliases [`ReaderV3`] and [`ReaderV4`] are provided for convenience.
pub struct Reader<R, V> {
    location: Location<V>,
    position_reader: bgzf::Reader<R>,
    item_reader: bgzf::Reader<R>,
}

impl<R, V> Reader<R, V>
where
    R: io::BufRead,
    V: Version,
{
    /// Returns a new record suitable for use in reading.
    pub fn create_record_buf(&self) -> Record<Id, V::Item> {
        V::create_record_buf(self.index())
    }

    /// Creates a new reader from its raw parts.
    ///
    /// A [`Builder`] will typically be a more ergonimic way to create a reader.
    ///
    /// Returns [`None`] if index contains no records.
    pub fn from_bgzf(
        index: Index<V>,
        position_reader: bgzf::Reader<R>,
        item_reader: bgzf::Reader<R>,
    ) -> Option<Self> {
        Location::setup(index).map(|location| Self {
            location,
            position_reader,
            item_reader,
        })
    }

    /// Returns the index.
    pub fn index(&self) -> &Index<V> {
        &self.location.index
    }

    /// Returns a mutable reference to the index.
    pub fn index_mut(&mut self) -> &mut Index<V> {
        &mut self.location.index
    }

    /// Returns the inner index, position reader, and item reader, consuming `self`.
    pub fn into_parts(self) -> (Index<V>, bgzf::Reader<R>, bgzf::Reader<R>) {
        (self.location.index, self.position_reader, self.item_reader)
    }

    /// Returns the inner item reader.
    pub fn item_reader(&self) -> &bgzf::Reader<R> {
        &self.item_reader
    }

    /// Returns a mutable reference to the inner item reader.
    pub fn item_reader_mut(&mut self) -> &mut bgzf::Reader<R> {
        &mut self.item_reader
    }

    /// Returns the inner position reader.
    pub fn position_reader(&self) -> &bgzf::Reader<R> {
        &self.position_reader
    }

    /// Returns a mutable reference to the inner position reader.
    pub fn position_reader_mut(&mut self) -> &mut bgzf::Reader<R> {
        &mut self.position_reader
    }

    /// Reads a single item from the item reader into the provided buffer.
    ///
    /// Note that this will bring the item and position readers out of sync. Use
    /// [`Self::read_record`] instead unless you wish to manually re-sync the underlying readers.
    pub fn read_item(&mut self, buf: &mut V::Item) -> io::Result<ReadStatus> {
        V::read_item(&mut self.item_reader, buf)
    }

    /// Reads and checks the magic numbers.
    ///
    /// Assumes the streams are positioned at the beginning of the files.
    pub fn read_magic(&mut self) -> io::Result<()> {
        V::read_magic(&mut self.position_reader).and_then(|_| V::read_magic(&mut self.item_reader))
    }

    /// Reads a single position from the position reader.
    ///
    /// Note that this will bring the item and position readers out of sync. Use
    /// [`Self::read_record`] instead unless you wish to manually re-sync the underlying readers.
    pub fn read_position(&mut self) -> io::Result<Option<u32>> {
        self.position_reader.read_position()
    }

    /// Reads a single record.
    ///
    /// Note that the record buffer needs to be correctly set up. Use [`Self::create_record_buf`]
    /// for a correctly initialised record buffer to use for reading.
    pub fn read_record(&mut self, record: &mut Record<Id, V::Item>) -> io::Result<ReadStatus> {
        if !self.location.contig_is_finished() || self.location.next_contig().is_some() {
            // Index still contains data, read and check that readers are not at EoF
            match (self.read_position()?, self.read_item(record.item_mut())?) {
                (Some(pos), ReadStatus::NotDone) => {
                    *record.contig_id_mut() = self.location.contig_id;
                    *record.position_mut() = pos;

                    self.location.next_site_on_contig();

                    Ok(ReadStatus::NotDone)
                }
                (Some(_), ReadStatus::Done) => Err(eof_err(
                    "reached EoF in SAF position file before reaching EoF in SAF item file",
                )),
                (None, ReadStatus::NotDone) => Err(eof_err(
                    "reached EoF in SAF item file before reaching EoF in SAF position file",
                )),
                (None, ReadStatus::Done) => Err(eof_err(
                    "reached EoF in both SAF files before reaching end of index",
                )),
            }
        } else {
            // Reached end of index, check that readers are at EoF
            let position_reader_is_done = ReadStatus::check(&mut self.position_reader)?.is_done();
            let item_reader_is_done = ReadStatus::check(&mut self.item_reader)?.is_done();

            match (position_reader_is_done, item_reader_is_done) {
                (true, true) => Ok(ReadStatus::Done),
                (true, false) => Err(data_err(
                    "reached end of index before reaching EoF in SAF position file",
                )),
                (false, true) => Err(data_err(
                    "reached end of index before reaching EoF in SAF item file",
                )),
                (false, false) => Err(data_err(
                    "reached end of index before reaching EoF in both SAF files",
                )),
            }
        }
    }
}

impl<R, V> Reader<R, V>
where
    R: io::BufRead + io::Seek,
    V: Version,
{
    /// Creates an intersection of two readers.
    ///
    /// The resulting intersecting readers will read only records that lie on the same contigs
    /// and the same positions. Further readers can be added to the resulting intersecting reader
    /// by chaining the [`Intersect::intersect`] method.
    pub fn intersect(self, other: Self) -> Intersect<R, V> {
        Intersect::from_reader(self).intersect(other)
    }

    /// Seeks to start of contig.
    ///
    /// The `contig_id` refers to the position of records in the index.
    ///
    /// # Panics
    ///
    /// Panics if `contig_id` is larger than the number of records defined in the index.
    pub fn seek(&mut self, contig_id: usize) -> io::Result<()> {
        self.location
            .set_contig(contig_id)
            .expect("cannot seek to contig ID");

        let record = &self.index().records()[contig_id];
        let position_offset = record.position_offset();
        let item_offset = record.item_offset();

        let position_vpos = bgzf::VirtualPosition::from(position_offset);
        self.position_reader.seek(position_vpos)?;

        let item_vpos = bgzf::VirtualPosition::from(item_offset);
        self.item_reader.seek(item_vpos)?;

        Ok(())
    }

    /// Seeks to start of contig by name.
    ///
    /// Note that this requires a linear search of names in the index with worst time complexity
    /// linear in the index size.. If the index is large, and the contig ID is known, prefer
    /// [`Self::seek`] is more efficient.
    ///
    /// # Panics
    ///
    /// Panics if sequence name is not defined in index.
    pub fn seek_by_name(&mut self, name: &str) -> io::Result<()> {
        let contig_id = self
            .index()
            .records()
            .iter()
            .position(|x| x.name() == name)
            .expect("name not found in index");

        self.seek(contig_id)
    }
}

/// A SAF reader location.
///
/// The location tracks the current location of the reader relative to its index file in terms
/// of which contig is currently being read, and how many sites are left on that contig.
#[derive(Clone, Debug, Eq, PartialEq)]
struct Location<V> {
    pub index: Index<V>,
    pub contig_id: usize,
    pub sites_left_on_contig: usize,
}

impl<V> Location<V>
where
    V: Version,
{
    /// Returns `true` if no more sites are left to read on the current contig.
    pub fn contig_is_finished(&self) -> bool {
        0 == self.sites_left_on_contig
    }

    /// Decrements the number of sites left to read on current contig.
    pub fn next_site_on_contig(&mut self) {
        self.sites_left_on_contig -= 1
    }

    /// Moves the location first site on the next contig in index.
    ///
    /// Returns `None` is no more contigs exist in the index.
    pub fn next_contig(&mut self) -> Option<()> {
        self.set_contig(self.contig_id + 1)
    }

    /// Moves the location to the first site on the contig with the provided ID in the index.
    ///
    /// Returns `None` if contig with the provided ID does not exist in the index.
    pub fn set_contig(&mut self, contig_id: usize) -> Option<()> {
        self.contig_id = contig_id;
        self.sites_left_on_contig = self.index.records().get(self.contig_id)?.sites();
        Some(())
    }

    /// Creates a new location from an index.
    ///
    /// The location will be set to the first site on the first contig. Returns `None` if no contigs
    /// are defined in the index.
    pub fn setup(index: Index<V>) -> Option<Self> {
        let contig_id = 0;
        let sites_left_on_contig = index.records().first()?.sites();

        Some(Self {
            index,
            contig_id,
            sites_left_on_contig,
        })
    }
}

fn eof_err(msg: &str) -> io::Error {
    io::Error::new(io::ErrorKind::UnexpectedEof, msg)
}

fn data_err(msg: &str) -> io::Error {
    io::Error::new(io::ErrorKind::InvalidData, msg)
}