1use std::collections::HashSet;
4use std::fs::File;
5use std::io::{BufReader, Read};
6use std::path::{Path, PathBuf};
7
8use sbol::SbolObject;
9use sbol::constants::{EDAM_IUPAC_DNA, EDAM_IUPAC_PROTEIN, SBO_DNA, SBO_PROTEIN, SBO_RNA};
10use sbol::{BuildError, Component, Document, Iri, Sequence as SbolSequence};
11
12use crate::alphabet::Alphabet;
13use crate::parser::{Record, parse_records};
14
15#[derive(Clone, Debug)]
26pub struct FastaImporter {
27 namespace: Iri,
28 forced_alphabet: Option<Alphabet>,
29}
30
31impl FastaImporter {
32 pub fn new(namespace: impl AsRef<str>) -> Result<Self, ImportError> {
34 let namespace = Iri::new(namespace.as_ref().to_owned())
35 .map_err(|err| ImportError::Namespace(err.to_string()))?;
36 Ok(Self {
37 namespace,
38 forced_alphabet: None,
39 })
40 }
41
42 pub fn with_alphabet(mut self, alphabet: Alphabet) -> Self {
45 self.forced_alphabet = Some(alphabet);
46 self
47 }
48
49 pub fn read<R: Read>(&self, mut reader: R) -> Result<(Document, ImportReport), ImportError> {
53 let mut buffer = String::new();
54 reader
55 .read_to_string(&mut buffer)
56 .map_err(|err| ImportError::Io {
57 path: PathBuf::from("<reader>"),
58 source: err,
59 })?;
60 self.read_str(&buffer)
61 }
62
63 pub fn read_str(&self, input: &str) -> Result<(Document, ImportReport), ImportError> {
65 let records = parse_records(input);
66 if records.is_empty() {
67 return Err(ImportError::Empty);
68 }
69
70 let mut objects: Vec<SbolObject> = Vec::new();
71 let mut report = ImportReport::default();
72 let mut used_display_ids: HashSet<String> = HashSet::new();
73
74 for (index, record) in records.iter().enumerate() {
75 self.append_record(
76 record,
77 index,
78 &mut used_display_ids,
79 &mut objects,
80 &mut report,
81 )?;
82 }
83
84 let document = Document::from_objects(objects).map_err(ImportError::Build)?;
85 Ok((document, report))
86 }
87
88 pub fn read_path(
91 &self,
92 path: impl AsRef<Path>,
93 ) -> Result<(Document, ImportReport), ImportError> {
94 let path = path.as_ref();
95 let file = File::open(path).map_err(|err| ImportError::Io {
96 path: path.to_path_buf(),
97 source: err,
98 })?;
99 self.read(BufReader::new(file))
100 }
101
102 fn append_record(
103 &self,
104 record: &Record,
105 index: usize,
106 used_display_ids: &mut HashSet<String>,
107 objects: &mut Vec<SbolObject>,
108 report: &mut ImportReport,
109 ) -> Result<(), ImportError> {
110 let raw_id = if record.id.is_empty() {
111 format!("record_{index}")
112 } else {
113 record.id.clone()
114 };
115 let base_display_id = sanitize_display_id(&raw_id);
116 let display_id = dedupe(base_display_id, used_display_ids);
117
118 if record.sequence.is_empty() {
119 report.warnings.push(ImportWarning::EmptyRecord {
120 record_id: raw_id.clone(),
121 });
122 }
123
124 let alphabet = self
125 .forced_alphabet
126 .unwrap_or_else(|| Alphabet::detect(&record.sequence));
127 let (component_type, encoding, elements_case) = match alphabet {
128 Alphabet::Dna => (SBO_DNA, EDAM_IUPAC_DNA, ElementsCase::Lower),
129 Alphabet::Rna => (SBO_RNA, EDAM_IUPAC_DNA, ElementsCase::Lower),
130 Alphabet::Protein => (SBO_PROTEIN, EDAM_IUPAC_PROTEIN, ElementsCase::Upper),
131 };
132
133 let sequence_display_id = format!("{display_id}_sequence");
135 let mut sequence_builder =
136 SbolSequence::builder(self.namespace.as_str(), sequence_display_id.as_str())
137 .map_err(ImportError::Build)?;
138 if !record.sequence.is_empty() {
139 let elements = match elements_case {
140 ElementsCase::Lower => record.sequence.to_ascii_lowercase(),
141 ElementsCase::Upper => record.sequence.to_ascii_uppercase(),
142 };
143 sequence_builder = sequence_builder.elements(elements);
144 }
145 sequence_builder = sequence_builder.encoding(encoding);
146 let sequence = sequence_builder.build().map_err(ImportError::Build)?;
147 let sequence_resource = sequence.identity.clone();
148 objects.push(SbolObject::Sequence(sequence));
149 report.sequences += 1;
150
151 let mut component_builder =
153 Component::builder(self.namespace.as_str(), display_id.as_str())
154 .map_err(ImportError::Build)?;
155 component_builder = component_builder.types([component_type]);
156 if let Some(description) = record.description.as_deref().map(str::trim)
157 && !description.is_empty()
158 {
159 component_builder = component_builder.name(&raw_id).description(description);
164 } else {
165 component_builder = component_builder.name(&raw_id);
166 }
167 component_builder = component_builder.add_sequence(sequence_resource);
168 let component = component_builder.build().map_err(ImportError::Build)?;
169 objects.push(SbolObject::Component(component));
170 report.components += 1;
171
172 match alphabet {
173 Alphabet::Dna => report.dna_records += 1,
174 Alphabet::Rna => report.rna_records += 1,
175 Alphabet::Protein => report.protein_records += 1,
176 }
177
178 Ok(())
179 }
180}
181
/// Letter case applied to sequence elements before they are stored.
#[derive(Copy, Clone)]
enum ElementsCase {
    /// Store elements via `to_ascii_lowercase`.
    Lower,
    /// Store elements via `to_ascii_uppercase`.
    Upper,
}
187
/// Returns a display ID that is not yet in `used`, and records it there.
///
/// `base` is returned unchanged when it is still free. Otherwise the first
/// free name of the form `{base}_{n}`, counting `n` upward from 2, is claimed
/// and returned.
fn dedupe(base: String, used: &mut HashSet<String>) -> String {
    if used.insert(base.clone()) {
        return base;
    }
    // `insert` doubles as the availability test: it returns `true` only for a
    // name that was not already taken, and claims it in the same step.
    (2..)
        .map(|n| format!("{base}_{n}"))
        .find(|candidate| used.insert(candidate.clone()))
        .unwrap_or_else(|| unreachable!("display ID space exhausted"))
}
200
/// Rewrites `raw` so it contains only ASCII letters, ASCII digits, and `_`.
///
/// * Every other character (including non-ASCII ones) becomes a single `_`.
/// * An empty input yields the literal `record`.
/// * A result that would start with an ASCII digit gets a leading `_`.
fn sanitize_display_id(raw: &str) -> String {
    // Each input character maps to exactly one output character, so the
    // result is empty precisely when the input is.
    if raw.is_empty() {
        return "record".to_owned();
    }

    let mut cleaned: String = raw
        .chars()
        .map(|ch| {
            if ch.is_ascii_alphanumeric() || ch == '_' {
                ch
            } else {
                '_'
            }
        })
        .collect();

    if cleaned.starts_with(|ch: char| ch.is_ascii_digit()) {
        cleaned.insert(0, '_');
    }
    cleaned
}
223
224#[derive(Clone, Debug, Default)]
226#[non_exhaustive]
227pub struct ImportReport {
228 pub components: usize,
229 pub sequences: usize,
230 pub dna_records: usize,
231 pub rna_records: usize,
232 pub protein_records: usize,
233 pub warnings: Vec<ImportWarning>,
234}
235
236impl ImportReport {
237 pub fn is_clean(&self) -> bool {
238 self.warnings.is_empty()
239 }
240}
241
/// Non-fatal condition noticed while importing a record.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ImportWarning {
    /// The record had a header but its sequence was empty.
    EmptyRecord {
        /// ID of the affected record, before display-ID sanitization.
        record_id: String,
    },
}
251
252#[derive(Debug)]
254#[non_exhaustive]
255pub enum ImportError {
256 Namespace(String),
258 Empty,
260 Io {
262 path: PathBuf,
263 source: std::io::Error,
264 },
265 Build(BuildError),
268}
269
270impl std::fmt::Display for ImportError {
271 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
272 match self {
273 Self::Namespace(msg) => write!(f, "invalid namespace: {msg}"),
274 Self::Empty => write!(
275 f,
276 "input contained no `>` records — was the file truncated?"
277 ),
278 Self::Io { path, source } => {
279 write!(f, "failed to read {}: {source}", path.display())
280 }
281 Self::Build(err) => write!(f, "SBOL object construction failed: {err}"),
282 }
283 }
284}
285
286impl std::error::Error for ImportError {
287 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
288 match self {
289 Self::Io { source, .. } => Some(source),
290 Self::Build(err) => Some(err),
291 _ => None,
292 }
293 }
294}