// segul 0.23.2
//
// An ultrafast and memory-efficient tool for phylogenomics.
// Documentation
use std::path::{Path, PathBuf};

use colored::Colorize;
use rayon::prelude::*;
use regex::Regex;

use crate::core::OutputPrint;
use crate::helper::files;
use crate::helper::finder::IDs;
use crate::helper::sequence::{SeqCheck, SeqParser};
use crate::helper::types::{DataType, Header, InputFmt, OutputFmt, SeqMatrix};
use crate::helper::utils;
use crate::writer::sequences::SeqWriter;

// Empty impl: nothing is overridden here, so `SequenceRemoval` relies on
// whatever default methods `OutputPrint` provides (presumably including the
// `print_output_fmt` call made in `print_output_info` — confirm in `crate::core`).
impl OutputPrint for SequenceRemoval<'_> {}

/// Selects which sequences `SequenceRemoval::remove` takes out of the input files.
pub enum SeqRemovalParameters {
    /// Sequence IDs to remove, used exactly as given.
    Id(Vec<String>),
    /// Regular expression pattern; every unique ID found in the input files
    /// that matches it is removed.
    Regex(String),
}

/// Removes selected sequences from a set of sequence files and writes the
/// remaining sequences to an output directory.
///
/// All configuration is borrowed from the caller for the lifetime `'a`.
pub struct SequenceRemoval<'a> {
    /// Format passed to the parser and the ID finder when reading input files.
    input_fmt: &'a InputFmt,
    /// Data type passed to the parser and the ID finder.
    datatype: &'a DataType,
    /// Directory used to build the path of every output file.
    output_dir: &'a Path,
    /// Format used when naming and writing the output files.
    output_fmt: &'a OutputFmt,
    /// How the sequences to remove are selected (explicit IDs or a regex).
    opts: &'a SeqRemovalParameters,
}

impl<'a> SequenceRemoval<'a> {
    /// Creates a sequence remover that borrows its configuration from the caller.
    ///
    /// * `input_fmt` - format passed to the parser when reading each input file.
    /// * `datatype` - data type passed to the parser and the ID finder.
    /// * `output_dir` - directory used to build every output file path.
    /// * `output_fmt` - format used when writing the filtered sequences.
    /// * `opts` - how the sequences to remove are selected.
    pub fn new(
        input_fmt: &'a InputFmt,
        datatype: &'a DataType,
        output_dir: &'a Path,
        output_fmt: &'a OutputFmt,
        opts: &'a SeqRemovalParameters,
    ) -> Self {
        Self {
            input_fmt,
            datatype,
            output_dir,
            output_fmt,
            opts,
        }
    }

    /// Removes the selected sequences from every file in `files` and writes
    /// the remaining sequences to the output directory.
    ///
    /// With `SeqRemovalParameters::Id` the given IDs are used directly. With
    /// `SeqRemovalParameters::Regex` the IDs are first resolved by matching
    /// the pattern against the unique IDs of all input files.
    ///
    /// A spinner is shown while the work runs, and the output location and
    /// format are logged afterwards.
    ///
    /// # Panics
    ///
    /// Panics if the regex pattern cannot be compiled or if writing an output
    /// file fails.
    pub fn remove(&self, files: &[PathBuf]) {
        let spin = utils::set_spinner();
        spin.set_message("Removing sequences...");
        match self.opts {
            SeqRemovalParameters::Id(ids) => self.par_remove(files, ids),
            SeqRemovalParameters::Regex(re) => {
                let ids = self.find_matching_ids(files, re);
                self.par_remove(files, &ids);
            }
        }
        spin.finish_with_message("Finished removing sequences!\n");
        self.print_output_info();
    }

    /// Returns every unique sequence ID across `files` that matches the
    /// regular expression `re`, in the order the ID finder yields them.
    ///
    /// # Panics
    ///
    /// Panics if `re` is not a valid regular expression.
    fn find_matching_ids(&self, files: &[PathBuf], re: &str) -> Vec<String> {
        let ids = IDs::new(files, self.input_fmt, self.datatype).id_unique();
        // Compile once; the same matcher is reused for every ID.
        let re = Regex::new(re).expect("Failed parsing regex");
        ids.iter()
            .filter(|id| re.is_match(id))
            .map(|id| id.to_string())
            .collect()
    }

    /// Processes all `files` in parallel: removes `ids` from each file and
    /// writes the result.
    ///
    /// A file whose sequences were all removed produces no output file.
    fn par_remove(&self, files: &[PathBuf], ids: &[String]) {
        files.par_iter().for_each(|file| {
            let (matrix, header) = self.remove_sequence(file, ids);
            if !matrix.is_empty() {
                self.write_output(&matrix, &header, file);
            }
        })
    }

    /// Writes `matrix` with its `header` to a file in the output directory
    /// whose name is derived from the input `file` and the output format.
    ///
    /// # Panics
    ///
    /// Panics if the sequence writer reports an error.
    fn write_output(&self, matrix: &SeqMatrix, header: &Header, file: &Path) {
        let output_path = files::create_output_fname(self.output_dir, file, self.output_fmt);
        let mut writer = SeqWriter::new(&output_path, matrix, header);
        writer
            .write_sequence(self.output_fmt)
            .expect("Failed writing output sequence");
    }

    /// Parses `fpath`, drops every sequence whose ID is listed in `ids`, and
    /// returns the remaining matrix together with a matching header.
    ///
    /// IDs that do not occur in the file are ignored. The header is rebuilt
    /// only when some, but not all, sequences were removed; otherwise the
    /// parsed header is returned as is. When the matrix ends up empty the
    /// header therefore still describes the original file, which is fine
    /// because `par_remove` never writes an empty matrix.
    fn remove_sequence(&self, fpath: &Path, ids: &[String]) -> (SeqMatrix, Header) {
        let (mut matrix, header) = SeqParser::new(fpath, self.datatype).parse(self.input_fmt);
        for id in ids {
            // `shift_remove` keeps the order of the remaining sequences; its
            // return value only says whether the ID was present.
            matrix.shift_remove(id);
        }

        let fnl_header = if !matrix.is_empty() && header.ntax != matrix.len() {
            self.get_header(&matrix)
        } else {
            header
        };
        (matrix, fnl_header)
    }

    /// Builds a fresh header for `matrix` from a sequence check: alignment
    /// status, longest sequence length, and the number of sequences.
    fn get_header(&self, matrix: &SeqMatrix) -> Header {
        let mut seq_info = SeqCheck::new();
        seq_info.check(matrix);
        let mut header = Header::new();
        header.aligned = seq_info.is_alignment;
        header.nchar = seq_info.longest;
        header.ntax = matrix.len();
        header
    }

    /// Logs the output directory and the output format.
    fn print_output_info(&self) {
        log::info!("{}", "Output".yellow());
        log::info!("{:18}: {}", "Output dir", self.output_dir.display());
        self.print_output_fmt(self.output_fmt);
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// FASTA fixture shared by every test in this module.
    const SIMPLE_FASTA: &str = "tests/files/simple.fas";

    /// Declares a `SequenceRemoval` named `$remove` in the calling scope.
    ///
    /// This is a macro rather than a helper function because the remover
    /// borrows all of its configuration; the borrowed values must live in the
    /// test's own scope.
    macro_rules! input {
        ($remove: ident) => {
            let opts = SeqRemovalParameters::Regex(String::from("^abc"));
            let datatype = DataType::Dna;
            let input_fmt = InputFmt::Fasta;
            let output_fmt = OutputFmt::Fasta;
            let output_dir = Path::new(".");
            let $remove =
                SequenceRemoval::new(&input_fmt, &datatype, output_dir, &output_fmt, &opts);
        };
    }

    /// Removing one explicit ID leaves a single taxon in the header.
    #[test]
    fn test_remove_seq() {
        input!(remove);
        let targets = [String::from("ABCD")];
        let (_, header) = remove.remove_sequence(Path::new(SIMPLE_FASTA), &targets);
        assert_eq!(header.ntax, 1);
    }

    /// Removing the IDs matched by a case-insensitive regex leaves a single
    /// taxon in the header.
    #[test]
    fn test_remove_regex() {
        input!(remove);
        let files = [PathBuf::from(SIMPLE_FASTA)];
        let matched = remove.find_matching_ids(&files, "(?i)^abc");
        let (_, header) = remove.remove_sequence(&files[0], &matched);
        assert_eq!(header.ntax, 1);
    }
}