deepbiop_fa/encode/
traits.rs

1use anyhow::Result;
2use log::info;
3use noodles::fasta;
4use std::{
5    io::BufReader,
6    path::{Path, PathBuf},
7};
8
9use super::record::RecordData;
10use deepbiop_utils as utils;
11use needletail::Sequence;
12
13pub trait Encoder {
14    type EncodeOutput;
15    type RecordOutput;
16
17    fn encode_multiple(&mut self, paths: &[PathBuf], parallel: bool) -> Self::EncodeOutput;
18    fn encode<P: AsRef<Path>>(&mut self, path: P) -> Self::EncodeOutput;
19    fn encode_record(&self, id: &[u8], seq: &[u8]) -> Self::RecordOutput;
20
21    fn fetch_records<P: AsRef<Path>>(&mut self, path: P) -> Result<Vec<RecordData>> {
22        info!("fetching records from {}", path.as_ref().display());
23        let reader = utils::io::create_reader_for_compressed_file(path)?;
24        let mut reader = fasta::io::Reader::new(BufReader::new(reader));
25
26        let records: Vec<RecordData> = reader
27            .records()
28            .filter_map(|record| {
29                let record = record.ok()?;
30
31                let id = record.definition().name();
32                let seq: &[u8] = record.sequence().as_ref();
33                let normalized_seq = seq.normalize(false);
34                Some((id.to_vec(), normalized_seq.to_vec()).into())
35            })
36            .collect();
37        info!("total records: {}", records.len());
38        Ok(records)
39    }
40}