Skip to main content

sbol_fasta/
importer.rs

1//! FASTA → SBOL 3 conversion engine.
2
3use std::collections::HashSet;
4use std::fs::File;
5use std::io::{BufReader, Read};
6use std::path::{Path, PathBuf};
7
8use sbol::SbolObject;
9use sbol::constants::{EDAM_IUPAC_DNA, EDAM_IUPAC_PROTEIN, SBO_DNA, SBO_PROTEIN, SBO_RNA};
10use sbol::{BuildError, Component, Document, Iri, Sequence as SbolSequence};
11
12use crate::alphabet::Alphabet;
13use crate::parser::{Record, parse_records};
14
15/// Imports FASTA records and emits SBOL 3 [`Document`]s.
16///
17/// `FastaImporter::new` takes the namespace IRI that the resulting
18/// SBOL 3 top-level objects will be rooted under — typically the
19/// owning lab or repository (e.g. `https://example.org/lab`).
20/// Component identities are derived as `{namespace}/{record-id}`.
21///
22/// By default, the alphabet of each record is detected automatically
23/// from the sequence text. Call [`FastaImporter::with_alphabet`] to
24/// override that detection when the data is ambiguous.
25#[derive(Clone, Debug)]
26pub struct FastaImporter {
27    namespace: Iri,
28    forced_alphabet: Option<Alphabet>,
29}
30
31impl FastaImporter {
32    /// Builds a new importer scoped to the supplied namespace IRI.
33    pub fn new(namespace: impl AsRef<str>) -> Result<Self, ImportError> {
34        let namespace = Iri::new(namespace.as_ref().to_owned())
35            .map_err(|err| ImportError::Namespace(err.to_string()))?;
36        Ok(Self {
37            namespace,
38            forced_alphabet: None,
39        })
40    }
41
42    /// Skips alphabet detection and forces every record to be
43    /// imported as the supplied [`Alphabet`].
44    pub fn with_alphabet(mut self, alphabet: Alphabet) -> Self {
45        self.forced_alphabet = Some(alphabet);
46        self
47    }
48
49    /// Reads every record from the supplied reader and returns one
50    /// SBOL 3 [`Document`] containing the emitted Components +
51    /// Sequences plus an [`ImportReport`] tallying what was produced.
52    pub fn read<R: Read>(&self, mut reader: R) -> Result<(Document, ImportReport), ImportError> {
53        let mut buffer = String::new();
54        reader
55            .read_to_string(&mut buffer)
56            .map_err(|err| ImportError::Io {
57                path: PathBuf::from("<reader>"),
58                source: err,
59            })?;
60        self.read_str(&buffer)
61    }
62
63    /// Reads from a string slice.
64    pub fn read_str(&self, input: &str) -> Result<(Document, ImportReport), ImportError> {
65        let records = parse_records(input);
66        if records.is_empty() {
67            return Err(ImportError::Empty);
68        }
69
70        let mut objects: Vec<SbolObject> = Vec::new();
71        let mut report = ImportReport::default();
72        let mut used_display_ids: HashSet<String> = HashSet::new();
73
74        for (index, record) in records.iter().enumerate() {
75            self.append_record(
76                record,
77                index,
78                &mut used_display_ids,
79                &mut objects,
80                &mut report,
81            )?;
82        }
83
84        let document = Document::from_objects(objects).map_err(ImportError::Build)?;
85        Ok((document, report))
86    }
87
88    /// Reads from a file on disk (`.fasta` / `.fa` / `.fna` / `.faa`
89    /// — the importer doesn't actually care about the extension).
90    pub fn read_path(
91        &self,
92        path: impl AsRef<Path>,
93    ) -> Result<(Document, ImportReport), ImportError> {
94        let path = path.as_ref();
95        let file = File::open(path).map_err(|err| ImportError::Io {
96            path: path.to_path_buf(),
97            source: err,
98        })?;
99        self.read(BufReader::new(file))
100    }
101
102    fn append_record(
103        &self,
104        record: &Record,
105        index: usize,
106        used_display_ids: &mut HashSet<String>,
107        objects: &mut Vec<SbolObject>,
108        report: &mut ImportReport,
109    ) -> Result<(), ImportError> {
110        let raw_id = if record.id.is_empty() {
111            format!("record_{index}")
112        } else {
113            record.id.clone()
114        };
115        let base_display_id = sanitize_display_id(&raw_id);
116        let display_id = dedupe(base_display_id, used_display_ids);
117
118        if record.sequence.is_empty() {
119            report.warnings.push(ImportWarning::EmptyRecord {
120                record_id: raw_id.clone(),
121            });
122        }
123
124        let alphabet = self
125            .forced_alphabet
126            .unwrap_or_else(|| Alphabet::detect(&record.sequence));
127        let (component_type, encoding, elements_case) = match alphabet {
128            Alphabet::Dna => (SBO_DNA, EDAM_IUPAC_DNA, ElementsCase::Lower),
129            Alphabet::Rna => (SBO_RNA, EDAM_IUPAC_DNA, ElementsCase::Lower),
130            Alphabet::Protein => (SBO_PROTEIN, EDAM_IUPAC_PROTEIN, ElementsCase::Upper),
131        };
132
133        // Sequence
134        let sequence_display_id = format!("{display_id}_sequence");
135        let mut sequence_builder =
136            SbolSequence::builder(self.namespace.as_str(), sequence_display_id.as_str())
137                .map_err(ImportError::Build)?;
138        if !record.sequence.is_empty() {
139            let elements = match elements_case {
140                ElementsCase::Lower => record.sequence.to_ascii_lowercase(),
141                ElementsCase::Upper => record.sequence.to_ascii_uppercase(),
142            };
143            sequence_builder = sequence_builder.elements(elements);
144        }
145        sequence_builder = sequence_builder.encoding(encoding);
146        let sequence = sequence_builder.build().map_err(ImportError::Build)?;
147        let sequence_resource = sequence.identity.clone();
148        objects.push(SbolObject::Sequence(sequence));
149        report.sequences += 1;
150
151        // Component
152        let mut component_builder =
153            Component::builder(self.namespace.as_str(), display_id.as_str())
154                .map_err(ImportError::Build)?;
155        component_builder = component_builder.types([component_type]);
156        if let Some(description) = record.description.as_deref().map(str::trim)
157            && !description.is_empty()
158        {
159            // Headers commonly read like `>id description text…`. We
160            // use the original record id (not the sanitized display
161            // id) as a human-friendly name, with the description
162            // following.
163            component_builder = component_builder.name(&raw_id).description(description);
164        } else {
165            component_builder = component_builder.name(&raw_id);
166        }
167        component_builder = component_builder.add_sequence(sequence_resource);
168        let component = component_builder.build().map_err(ImportError::Build)?;
169        objects.push(SbolObject::Component(component));
170        report.components += 1;
171
172        match alphabet {
173            Alphabet::Dna => report.dna_records += 1,
174            Alphabet::Rna => report.rna_records += 1,
175            Alphabet::Protein => report.protein_records += 1,
176        }
177
178        Ok(())
179    }
180}
181
/// Letter case applied to a record's sequence text before it is stored
/// as the `elements` of the emitted Sequence.
#[derive(Copy, Clone)]
enum ElementsCase {
    /// Store the sequence text in ASCII lowercase.
    Lower,
    /// Store the sequence text in ASCII uppercase.
    Upper,
}
187
/// Returns a display id that is not yet in `used` and records it there.
///
/// `base` is returned unchanged when it is still free. Otherwise the
/// first free candidate of the form `{base}_2`, `{base}_3`, … is
/// reserved and returned.
fn dedupe(base: String, used: &mut HashSet<String>) -> String {
    let base_is_free = used.insert(base.clone());
    if base_is_free {
        return base;
    }
    // `insert` reports whether the candidate was new, so the search
    // reserves the winning id as a side effect of finding it.
    (2..)
        .map(|suffix| format!("{base}_{suffix}"))
        .find(|candidate| used.insert(candidate.clone()))
        .unwrap_or_else(|| unreachable!("display ID space exhausted"))
}
200
/// Turns an arbitrary record id into text made only of ASCII letters,
/// ASCII digits and underscores that does not start with a digit.
///
/// Every other character is replaced by a single `_`, and a leading `_`
/// is added when the result would otherwise begin with a digit. An empty
/// input yields `"record"`.
fn sanitize_display_id(raw: &str) -> String {
    // Each input character maps to exactly one output character, so the
    // result can only be empty when the input is.
    if raw.is_empty() {
        return "record".to_owned();
    }

    let mut sanitized: String = raw
        .chars()
        .map(|c| match c {
            '_' => c,
            c if c.is_ascii_alphanumeric() => c,
            _ => '_',
        })
        .collect();

    if sanitized.starts_with(|first: char| first.is_ascii_digit()) {
        sanitized.insert(0, '_');
    }
    sanitized
}
223
224/// Tally of what a [`FastaImporter`] run produced.
225#[derive(Clone, Debug, Default)]
226#[non_exhaustive]
227pub struct ImportReport {
228    pub components: usize,
229    pub sequences: usize,
230    pub dna_records: usize,
231    pub rna_records: usize,
232    pub protein_records: usize,
233    pub warnings: Vec<ImportWarning>,
234}
235
236impl ImportReport {
237    pub fn is_clean(&self) -> bool {
238        self.warnings.is_empty()
239    }
240}
241
/// Problems found during a FASTA import that do not abort it.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ImportWarning {
    /// A header was present but no sequence body followed it. The
    /// Component and Sequence are emitted anyway; the Sequence just
    /// carries no `elements`.
    EmptyRecord {
        /// Id of the affected record, before display-id sanitizing.
        record_id: String,
    },
}
251
252/// Fatal errors from [`FastaImporter`].
253#[derive(Debug)]
254#[non_exhaustive]
255pub enum ImportError {
256    /// The namespace IRI was invalid.
257    Namespace(String),
258    /// The input contained no `>` records.
259    Empty,
260    /// Filesystem read failure (for [`FastaImporter::read_path`]).
261    Io {
262        path: PathBuf,
263        source: std::io::Error,
264    },
265    /// SBOL 3 object construction failed (typically an invalid
266    /// displayId or namespace).
267    Build(BuildError),
268}
269
270impl std::fmt::Display for ImportError {
271    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
272        match self {
273            Self::Namespace(msg) => write!(f, "invalid namespace: {msg}"),
274            Self::Empty => write!(
275                f,
276                "input contained no `>` records — was the file truncated?"
277            ),
278            Self::Io { path, source } => {
279                write!(f, "failed to read {}: {source}", path.display())
280            }
281            Self::Build(err) => write!(f, "SBOL object construction failed: {err}"),
282        }
283    }
284}
285
286impl std::error::Error for ImportError {
287    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
288        match self {
289            Self::Io { source, .. } => Some(source),
290            Self::Build(err) => Some(err),
291            _ => None,
292        }
293    }
294}