segul 0.23.2

An ultrafast and memory-efficient tool for phylogenomics
Documentation
//! A simple fasta parser that reads a fasta file and returns a sequence matrix.

use std::fs::File;
use std::io::prelude::*;
use std::io::BufReader;
use std::path::Path;

use indexmap::{IndexMap, IndexSet};

use crate::helper::alphabet;
use crate::helper::sequence::SeqCheck;
use crate::helper::types::{DataType, Header, SeqMatrix};

/// Parse a fasta file sequence IDs.
/// return a set of sequence IDs.
pub fn parse_only_id(input: &Path) -> IndexSet<String> {
    let file = File::open(input).expect("Failed opening a fasta file.");
    let buff = BufReader::new(file);
    let mut ids = IndexSet::new();
    buff.lines()
        .map_while(Result::ok)
        .filter(|line| line.starts_with('>'))
        .for_each(|line| {
            if let Some(id) = line.strip_prefix('>') {
                ids.insert(id.trim().to_string());
            }
        });
    ids
}

/// Parse a fasta file and return a sequence matrix.
/// It does not allow duplicate IDs or spaces in the sequence IDs.
pub struct Fasta<'a> {
    input: &'a Path,
    datatype: &'a DataType,
    pub matrix: SeqMatrix,
    pub header: Header,
}

impl<'a> Fasta<'a> {
    /// Create a new Fasta instance.
    pub fn new(input: &'a Path, datatype: &'a DataType) -> Self {
        Self {
            input,
            datatype,
            matrix: IndexMap::new(),
            header: Header::new(),
        }
    }

    /// Parse a fasta file.
    pub fn parse(&mut self) {
        let file = File::open(self.input).expect("Failed opening a fasta file");
        let buff = BufReader::new(file);
        self.parse_matrix(buff);
        let mut seq_info = SeqCheck::new();
        assert!(
            !self.matrix.is_empty(),
            "{} is empty. \
        Make sure the file format is fasta or it is not an empty file!",
            self.input.display()
        );
        seq_info.check(&self.matrix);
        self.header.aligned = seq_info.is_alignment;
        self.header.nchar = seq_info.longest;
        self.header.ntax = self.matrix.len();
        self.match_header_datatype();
    }

    fn parse_matrix<R: Read>(&mut self, buff: R) {
        let fasta = FastaReader::new(buff);
        fasta
            .into_iter()
            .for_each(|fas| match self.matrix.get(&fas.id) {
                Some(original_seq) => panic!(
                    "\nFound duplicate IDs for file {}. \
                    First duplicate ID found: {}. \
                Both sequences are the same: {}.\n
                ",
                    self.input.display(),
                    fas.id,
                    *original_seq == fas.seq
                ),
                None => {
                    alphabet::check_valid_seq(self.input, self.datatype, &fas.id, &fas.seq);
                    self.matrix.insert(fas.id, fas.seq);
                }
            });
    }

    #[inline]
    fn match_header_datatype(&mut self) {
        if let DataType::Aa = self.datatype {
            self.header.datatype = String::from("protein")
        };
    }
}

pub struct Records {
    pub id: String,
    pub seq: String,
}

impl Records {
    pub fn new(id: &str, seq: &str) -> Self {
        Self {
            id: String::from(id),
            seq: String::from(seq),
        }
    }
}

pub struct FastaReader<R> {
    reader: BufReader<R>,
    id: String,
    seq: String,
}

impl<R: Read> FastaReader<R> {
    pub fn new(file: R) -> Self {
        Self {
            reader: BufReader::new(file),
            id: String::new(),
            seq: String::new(),
        }
    }

    pub fn next_seq(&mut self) -> Option<Records> {
        while let Some(Ok(line)) = self.reader.by_ref().lines().next() {
            if let Some(id) = line.strip_prefix('>') {
                if self.id.is_empty() {
                    self.id = String::from(id.trim());
                    self.seq.clear();
                } else {
                    let recs = self.get_recs(&self.id, &self.seq);
                    self.id = String::from(id.trim());
                    self.seq.clear();
                    return Some(recs);
                }
            } else {
                self.seq.push_str(line.trim());
            }
        }

        // Return the last found record.
        if !self.id.is_empty() {
            let recs = self.get_recs(&self.id, &self.seq);
            self.id.clear();
            self.seq.clear();
            Some(recs)
        } else {
            None
        }
    }

    #[inline]
    fn get_recs(&self, id: &str, seq: &str) -> Records {
        Records::new(id, seq)
    }
}

impl<R: Read> Iterator for FastaReader<R> {
    type Item = Records;

    fn next(&mut self) -> Option<Self::Item> {
        self.next_seq()
    }
}

#[cfg(test)]
mod test {
    use super::*;

    const DNA: DataType = DataType::Dna;
    const AA: DataType = DataType::Aa;

    #[test]
    fn read_fasta_simple_test() {
        let path = Path::new("tests/files/simple.fas");
        let mut fasta = Fasta::new(path, &DNA);
        fasta.parse();

        assert_eq!(2, fasta.matrix.len());
    }

    #[test]
    fn check_is_alignment_test() {
        let path = Path::new("tests/files/simple.fas");
        let mut fasta = Fasta::new(path, &DNA);
        fasta.parse();

        assert!(fasta.header.aligned);
    }

    #[test]
    fn check_isnot_alignment_test() {
        let path = Path::new("tests/files/unaligned.fas");
        let mut fasta = Fasta::new(path, &DNA);
        fasta.parse();

        assert!(!fasta.header.aligned);
    }

    #[test]
    fn interleaved_fas_test() {
        let path = Path::new("tests/files/interleave.fas");
        let file = File::open(path).unwrap();
        let rec = FastaReader::new(file);
        let mut seq = IndexMap::new();
        rec.into_iter().for_each(|r| {
            seq.insert(r.id, r.seq);
        });

        let res = String::from("AGTATGATGTATATGTAT");
        let res_2 = String::from("AGTATGATGTATAAAAAA");
        assert_eq!(Some(&res), seq.get("ABCD"));
        assert_eq!(Some(&res_2), seq.get("ABCE"));
    }

    #[test]
    fn simple_aa_fas_test() {
        let sample = Path::new("tests/files/simple_aa.fas");
        let mut fas = Fasta::new(sample, &AA);
        fas.parse();
        let key = String::from("ABCE");
        let res = String::from("MAYPMQLGFQDATSPI");
        assert_eq!(Some(&res), fas.matrix.get(&key));
        assert_eq!(String::from("protein"), fas.header.datatype);
    }

    #[test]
    fn simple_trailing_space_fas_test() {
        let sample = Path::new("tests/files/simple_id_whitespaces.fas");
        let mut fas = Fasta::new(sample, &DNA);
        fas.parse();
        let key = String::from("CDEF");
        let res = String::from("ATGTAT");
        assert_eq!(Some(&res), fas.matrix.get(&key));
    }
}