use std::{io::Read, fs::File};
use thiserror::Error;
use sigalign_core::reference::Reference as RawReference;
use sigalign_impl::{
pattern_index::dynamic_lfi::{
DynamicLfiOption, LfiBuildError,
},
sequence_storage::in_memory::InMemoryStorage,
};
use super::Reference;
/// Builder that collects target sequences and preprocessing options and then
/// produces a [`Reference`] through `build`.
pub struct ReferenceBuilder {
/// When `true`, `build` calls `set_sequences_to_uppercase` on the storage.
uppercase: bool,
/// Bases that `build` rewrites to `b'?'` through `change_bases_to`.
to_ignore_bases: Vec<u8>,
/// In-memory storage that accumulates every added target and FASTA input.
sequence_storage: InMemoryStorage,
}
/// Errors returned by the fallible methods of `ReferenceBuilder`.
#[derive(Error, Debug)]
pub enum ReferenceBuildError {
/// Wraps an `LfiBuildError`; its message is forwarded unchanged and the
/// `#[from]` attribute lets `?` convert it automatically.
#[error(transparent)]
PatternIndexError(#[from] LfiBuildError),
/// Input could not be accepted; the payload is shown after "Invalid input: ".
#[error("Invalid input: {0}")]
InvalidSequence(String),
/// Wraps a `std::io::Error`; its message is forwarded unchanged and the
/// `#[from]` attribute lets `?` convert it automatically.
#[error(transparent)]
IoError(#[from] std::io::Error),
/// Reports an empty sequence.
// NOTE(review): no code visible in this file constructs this variant — confirm
// whether `build` is expected to return it for an empty storage.
#[error("Sequence is empty")]
EmptySequence,
}
impl ReferenceBuilder {
    /// Creates a builder with uppercase conversion enabled, no ignored bases
    /// and an empty in-memory sequence storage.
    pub fn new() -> Self {
        let sequence_storage = InMemoryStorage::new();
        Self {
            uppercase: true,
            to_ignore_bases: Vec::new(),
            sequence_storage,
        }
    }

    /// Chooses whether `build` converts the stored sequences to uppercase.
    pub fn set_uppercase(self, uppercase: bool) -> Self {
        Self { uppercase, ..self }
    }

    /// Registers one base to be rewritten to `b'?'` when `build` runs.
    pub fn ignore_base(self, base: u8) -> Self {
        self.ignore_bases(&[base])
    }

    /// Registers several bases to be rewritten to `b'?'` when `build` runs.
    ///
    /// The bases are appended to the ones registered so far.
    pub fn ignore_bases(mut self, bases: &[u8]) -> Self {
        self.to_ignore_bases.extend(bases.iter().copied());
        self
    }

    /// Forgets every base registered through `ignore_base` / `ignore_bases`.
    pub fn reset_ignore_bases(mut self) -> Self {
        self.to_ignore_bases.clear();
        self
    }

    /// Adds a single labeled target sequence to the storage.
    pub fn add_target(self, label: &str, sequence: &[u8]) -> Self {
        let mut builder = self;
        builder.sequence_storage.add_target(label, sequence);
        builder
    }

    /// Adds the records read from `reader` (FASTA input) to the storage.
    ///
    /// # Errors
    ///
    /// Any failure reported by the storage is mapped to
    /// `ReferenceBuildError::InvalidSequence`; the underlying error is discarded.
    pub fn add_fasta<R: Read>(mut self, reader: R) -> Result<Self, ReferenceBuildError> {
        match self.sequence_storage.add_fasta(reader) {
            Ok(_) => Ok(self),
            Err(_) => Err(ReferenceBuildError::invalid_fasta_record()),
        }
    }

    /// Opens the file at `path` and adds its FASTA records to the storage.
    ///
    /// # Errors
    ///
    /// Returns `ReferenceBuildError::IoError` when the file cannot be opened,
    /// and `ReferenceBuildError::InvalidSequence` when the storage rejects the
    /// content.
    pub fn add_fasta_file<P>(self, path: P) -> Result<Self, ReferenceBuildError>
    where
        P: AsRef<std::path::Path> + std::fmt::Debug,
    {
        // Same storage call and error mapping as `add_fasta`, with a `File` as reader.
        self.add_fasta(File::open(path)?)
    }

    /// Applies the configured preprocessing and builds the `Reference`.
    ///
    /// Uppercase conversion (if enabled) runs first, then the ignored bases
    /// (if any) are rewritten to `b'?'`, and finally the raw reference is
    /// constructed from the storage.
    ///
    /// # Errors
    ///
    /// Propagates the error of `RawReference::new`, converted into
    /// `ReferenceBuildError`.
    pub fn build(self) -> Result<Reference, ReferenceBuildError> {
        let Self {
            uppercase,
            to_ignore_bases,
            mut sequence_storage,
        } = self;

        if uppercase {
            sequence_storage.set_sequences_to_uppercase();
        }
        if !to_ignore_bases.is_empty() {
            sequence_storage.change_bases_to(&to_ignore_bases, b'?');
        }

        // The option depends on the total length, so compute it before the
        // storage is moved into the raw reference.
        let lfi_option = Self::get_option_for_dynamic_lfi(&sequence_storage);
        let raw = RawReference::new(sequence_storage, lfi_option)?;
        Ok(Reference::from(raw))
    }

    /// Derives the `DynamicLfiOption` from the total length of the storage.
    ///
    /// The lookup table budget is one eighth of the total length, capped at
    /// 200 MiB.
    fn get_option_for_dynamic_lfi(sequence_storage: &InMemoryStorage) -> DynamicLfiOption {
        const LOOKUP_TABLE_BYTES_CAP: u64 = 200 * 1024 * 1024;

        let length_based_budget = (sequence_storage.get_total_length() / 8) as u64;
        DynamicLfiOption {
            suffix_array_sampling_ratio: 1,
            lookup_table_max_bytes_size: length_based_budget.min(LOOKUP_TABLE_BYTES_CAP),
            use_safe_guard: true,
        }
    }
}
impl ReferenceBuildError {
    /// Builds the `InvalidSequence` error used when adding FASTA input fails.
    fn invalid_fasta_record() -> Self {
        let message = String::from("ID of FASTA record is invalid UTF8");
        Self::InvalidSequence(message)
    }
}