use std::collections::HashSet;
use std::fs::File;
use std::io::{BufReader, Read};
use std::path::{Path, PathBuf};
use sbol::SbolObject;
use sbol::constants::{EDAM_IUPAC_DNA, EDAM_IUPAC_PROTEIN, SBO_DNA, SBO_PROTEIN, SBO_RNA};
use sbol::{BuildError, Component, Document, Iri, Sequence as SbolSequence};
use crate::alphabet::Alphabet;
use crate::parser::{Record, parse_records};
#[derive(Clone, Debug)]
pub struct FastaImporter {
namespace: Iri,
forced_alphabet: Option<Alphabet>,
}
impl FastaImporter {
pub fn new(namespace: impl AsRef<str>) -> Result<Self, ImportError> {
let namespace = Iri::new(namespace.as_ref().to_owned())
.map_err(|err| ImportError::Namespace(err.to_string()))?;
Ok(Self {
namespace,
forced_alphabet: None,
})
}
pub fn with_alphabet(mut self, alphabet: Alphabet) -> Self {
self.forced_alphabet = Some(alphabet);
self
}
pub fn read<R: Read>(&self, mut reader: R) -> Result<(Document, ImportReport), ImportError> {
let mut buffer = String::new();
reader
.read_to_string(&mut buffer)
.map_err(|err| ImportError::Io {
path: PathBuf::from("<reader>"),
source: err,
})?;
self.read_str(&buffer)
}
pub fn read_str(&self, input: &str) -> Result<(Document, ImportReport), ImportError> {
let records = parse_records(input);
if records.is_empty() {
return Err(ImportError::Empty);
}
let mut objects: Vec<SbolObject> = Vec::new();
let mut report = ImportReport::default();
let mut used_display_ids: HashSet<String> = HashSet::new();
for (index, record) in records.iter().enumerate() {
self.append_record(
record,
index,
&mut used_display_ids,
&mut objects,
&mut report,
)?;
}
let document = Document::from_objects(objects).map_err(ImportError::Build)?;
Ok((document, report))
}
pub fn read_path(
&self,
path: impl AsRef<Path>,
) -> Result<(Document, ImportReport), ImportError> {
let path = path.as_ref();
let file = File::open(path).map_err(|err| ImportError::Io {
path: path.to_path_buf(),
source: err,
})?;
self.read(BufReader::new(file))
}
fn append_record(
&self,
record: &Record,
index: usize,
used_display_ids: &mut HashSet<String>,
objects: &mut Vec<SbolObject>,
report: &mut ImportReport,
) -> Result<(), ImportError> {
let raw_id = if record.id.is_empty() {
format!("record_{index}")
} else {
record.id.clone()
};
let base_display_id = sanitize_display_id(&raw_id);
let display_id = dedupe(base_display_id, used_display_ids);
if record.sequence.is_empty() {
report.warnings.push(ImportWarning::EmptyRecord {
record_id: raw_id.clone(),
});
}
let alphabet = self
.forced_alphabet
.unwrap_or_else(|| Alphabet::detect(&record.sequence));
let (component_type, encoding, elements_case) = match alphabet {
Alphabet::Dna => (SBO_DNA, EDAM_IUPAC_DNA, ElementsCase::Lower),
Alphabet::Rna => (SBO_RNA, EDAM_IUPAC_DNA, ElementsCase::Lower),
Alphabet::Protein => (SBO_PROTEIN, EDAM_IUPAC_PROTEIN, ElementsCase::Upper),
};
let sequence_display_id = format!("{display_id}_sequence");
let mut sequence_builder =
SbolSequence::builder(self.namespace.as_str(), sequence_display_id.as_str())
.map_err(ImportError::Build)?;
if !record.sequence.is_empty() {
let elements = match elements_case {
ElementsCase::Lower => record.sequence.to_ascii_lowercase(),
ElementsCase::Upper => record.sequence.to_ascii_uppercase(),
};
sequence_builder = sequence_builder.elements(elements);
}
sequence_builder = sequence_builder.encoding(encoding);
let sequence = sequence_builder.build().map_err(ImportError::Build)?;
let sequence_resource = sequence.identity.clone();
objects.push(SbolObject::Sequence(sequence));
report.sequences += 1;
let mut component_builder =
Component::builder(self.namespace.as_str(), display_id.as_str())
.map_err(ImportError::Build)?;
component_builder = component_builder.types([component_type]);
if let Some(description) = record.description.as_deref().map(str::trim)
&& !description.is_empty()
{
component_builder = component_builder.name(&raw_id).description(description);
} else {
component_builder = component_builder.name(&raw_id);
}
component_builder = component_builder.add_sequence(sequence_resource);
let component = component_builder.build().map_err(ImportError::Build)?;
objects.push(SbolObject::Component(component));
report.components += 1;
match alphabet {
Alphabet::Dna => report.dna_records += 1,
Alphabet::Rna => report.rna_records += 1,
Alphabet::Protein => report.protein_records += 1,
}
Ok(())
}
}
#[derive(Clone, Copy)]
enum ElementsCase {
Lower,
Upper,
}
fn dedupe(base: String, used: &mut HashSet<String>) -> String {
if used.insert(base.clone()) {
return base;
}
for suffix in 2.. {
let candidate = format!("{base}_{suffix}");
if used.insert(candidate.clone()) {
return candidate;
}
}
unreachable!("display ID space exhausted");
}
fn sanitize_display_id(raw: &str) -> String {
let mut out = String::with_capacity(raw.len());
for c in raw.chars() {
if c.is_ascii_alphanumeric() || c == '_' {
out.push(c);
} else {
out.push('_');
}
}
if out.is_empty() {
return "record".to_owned();
}
if out
.chars()
.next()
.map(|c| c.is_ascii_digit())
.unwrap_or(false)
{
out.insert(0, '_');
}
out
}
#[derive(Clone, Debug, Default)]
#[non_exhaustive]
pub struct ImportReport {
pub components: usize,
pub sequences: usize,
pub dna_records: usize,
pub rna_records: usize,
pub protein_records: usize,
pub warnings: Vec<ImportWarning>,
}
impl ImportReport {
pub fn is_clean(&self) -> bool {
self.warnings.is_empty()
}
}
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum ImportWarning {
EmptyRecord { record_id: String },
}
#[derive(Debug)]
#[non_exhaustive]
pub enum ImportError {
Namespace(String),
Empty,
Io {
path: PathBuf,
source: std::io::Error,
},
Build(BuildError),
}
impl std::fmt::Display for ImportError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Namespace(msg) => write!(f, "invalid namespace: {msg}"),
Self::Empty => write!(
f,
"input contained no `>` records — was the file truncated?"
),
Self::Io { path, source } => {
write!(f, "failed to read {}: {source}", path.display())
}
Self::Build(err) => write!(f, "SBOL object construction failed: {err}"),
}
}
}
impl std::error::Error for ImportError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Self::Io { source, .. } => Some(source),
Self::Build(err) => Some(err),
_ => None,
}
}
}