deepbiop_fa/encode/
traits.rs1use anyhow::Result;
2use log::info;
3use noodles::fasta;
4use std::{
5 io::BufReader,
6 path::{Path, PathBuf},
7};
8
9use super::record::RecordData;
10use deepbiop_utils as utils;
11use needletail::Sequence;
12
13pub trait Encoder {
14 type EncodeOutput;
15 type RecordOutput;
16
17 fn encode_multiple(&mut self, paths: &[PathBuf], parallel: bool) -> Self::EncodeOutput;
18 fn encode<P: AsRef<Path>>(&mut self, path: P) -> Self::EncodeOutput;
19 fn encode_record(&self, id: &[u8], seq: &[u8]) -> Self::RecordOutput;
20
21 fn fetch_records<P: AsRef<Path>>(&mut self, path: P) -> Result<Vec<RecordData>> {
22 info!("fetching records from {}", path.as_ref().display());
23 let reader = utils::io::create_reader_for_compressed_file(path)?;
24 let mut reader = fasta::io::Reader::new(BufReader::new(reader));
25
26 let records: Vec<RecordData> = reader
27 .records()
28 .filter_map(|record| {
29 let record = record.ok()?;
30
31 let id = record.definition().name();
32 let seq: &[u8] = record.sequence().as_ref();
33 let normalized_seq = seq.normalize(false);
34 Some((id.to_vec(), normalized_seq.to_vec()).into())
35 })
36 .collect();
37 info!("total records: {}", records.len());
38 Ok(records)
39 }
40}