Skip to main content

sbol_ontology/
lib.rs

1//! Offline ontology facts for SBOL validation.
2//!
3//! `sbol-ontology` embeds a compact, SBOL-specific fact snapshot derived from
4//! canonical ontology sources. It does not fetch network resources at runtime.
5//! The bundled [`Ontology`] accepts common SBOL document IRIs, OBO PURLs, and
6//! compact IDs, then exposes branch membership, conflict, and compatibility
7//! queries used by the `sbol` validator.
8//!
9//! Extension snapshots (e.g. NCIT) can be loaded from a TSV that follows the
10//! same column schema as the bundled file. Compose them with the bundled
11//! snapshot through [`OntologyRegistry`].
12
13#![forbid(unsafe_code)]
14
15pub mod cache;
16pub mod download;
17pub mod parser;
18
19pub use cache::{
20    BranchRoot, BuildError, InstallError, InstalledOntology, KnownOntology, OntologyCache,
21    OntologyDescriptor, SourceFormat, VerifyError,
22};
23
24use std::borrow::Cow;
25use std::collections::{BTreeMap, BTreeSet};
26use std::fs;
27use std::io;
28use std::path::Path;
29use std::sync::OnceLock;
30
/// Bundled fact snapshot text, embedded at compile time from
/// `../data/sbol3_ontology_facts.tsv` (relative to this source file).
const FACTS: &str = include_str!("../data/sbol3_ontology_facts.tsv");
/// Bundled provenance table text, embedded at compile time from
/// `../data/ontology_sources.tsv` (relative to this source file).
const SOURCES: &str = include_str!("../data/ontology_sources.tsv");

/// Current TSV format version. Snapshots that do not carry this version
/// in a `# format_version: N` header line are rejected at load time.
pub const TSV_FORMAT_VERSION: u32 = 1;
37
/// Broad family for SBOL Component type terms.
///
/// In the TSV snapshot the family column holds one of the tokens listed on the
/// variants below, or `-` when a term has no component family.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ComponentTypeFamily {
    /// TSV token `nucleic_acid`.
    NucleicAcid,
    /// TSV token `protein`.
    Protein,
    /// TSV token `simple_chemical`.
    SimpleChemical,
    /// TSV token `complex`.
    Complex,
    /// TSV token `functional`.
    Functional,
}
47
/// Broad family for SBOL Sequence encoding terms.
///
/// In the TSV snapshot the family column holds one of the tokens listed on the
/// variants below, or `-` when a term has no sequence encoding family.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum SequenceEncodingFamily {
    /// TSV token `nucleic_acid`.
    NucleicAcid,
    /// TSV token `protein`.
    Protein,
    /// TSV token `simple_chemical`.
    SimpleChemical,
    /// TSV token `other_textual`. Never matched against a Component type
    /// family by the family-based compatibility fallback.
    OtherTextual,
}
56
/// Ontology namespace represented by a bundled or extension term.
///
/// Each variant corresponds to the upper-case namespace token used in the TSV
/// snapshot's namespace column.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum OntologyNamespace {
    /// TSV token `EDAM`.
    Edam,
    /// TSV token `SBO`.
    Sbo,
    /// TSV token `SO`.
    So,
    /// TSV token `GO`.
    Go,
    /// TSV token `CHEBI`.
    Chebi,
    /// TSV token `CL`.
    Cl,
    /// TSV token `NCIT`.
    Ncit,
}
68
/// SBOL-facing role assigned to a bundled ontology term.
///
/// Each variant corresponds to a snake_case token in the TSV snapshot's role
/// column.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum TermRole {
    /// TSV token `sequence_encoding`.
    SequenceEncoding,
    /// TSV token `component_type`.
    ComponentType,
    /// TSV token `component_type_modifier`; accepted alongside
    /// [`TermRole::ComponentType`] by `Ontology::is_component_type_term`.
    ComponentTypeModifier,
    /// TSV token `interaction_type`.
    InteractionType,
    /// TSV token `participation_role`.
    ParticipationRole,
    /// TSV token `feature_role`.
    FeatureRole,
    /// TSV token `other`.
    Other,
}
80
/// Provenance for one upstream ontology source.
///
/// Each record is read from one eight-column, tab-separated row of the sources
/// table; fields are stored verbatim in column order.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct OntologyProvenance {
    /// Column 0 of the sources row.
    pub ontology: String,
    /// Column 1 of the sources row.
    pub source_url: String,
    /// Column 2 of the sources row.
    pub version: String,
    /// Column 3 of the sources row.
    pub license: String,
    /// Column 4 of the sources row.
    pub retrieved: String,
    /// Column 5 of the sources row.
    pub raw_sha256: String,
    /// Column 6 of the sources row.
    pub fact_sha256: String,
    /// Column 7 of the sources row.
    pub notes: String,
}
93
/// Offline ontology query surface used by SBOL validation.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Ontology {
    /// Term facts keyed by the ID column of their `term` row.
    terms: BTreeMap<String, TermFact>,
    /// Accepted spellings (the ID itself, the term IRI, and listed aliases)
    /// mapped to the term ID.
    aliases: BTreeMap<String, String>,
    /// `(term, branch_root)` pairs taken from `branch` rows.
    branches: BTreeSet<(String, String)>,
    /// `(encoding, component_type)` pairs taken from `compat` rows.
    compatibilities: BTreeSet<(String, String)>,
    /// Conflicting term pairs from `conflict` rows, stored with the
    /// lexicographically smaller ID first.
    conflicts: BTreeSet<(String, String)>,
    /// Term IDs taken from `component_role` rows.
    component_role_terms: BTreeSet<String>,
    /// `(role, component_type)` pairs taken from `component_role_compat` rows.
    component_role_compatibilities: BTreeSet<(String, String)>,
    /// `(interaction_type, role)` pairs taken from `participation_compat` rows.
    participation_compatibilities: BTreeSet<(String, String)>,
    /// Provenance records parsed from the sources table or attached later.
    provenance: Vec<OntologyProvenance>,
}
107
/// Facts recorded for one `term` row of the snapshot (the row's ID is the map
/// key in `Ontology::terms`, not a field here).
#[derive(Clone, Debug, PartialEq, Eq)]
struct TermFact {
    /// Column 2: IRI returned by `Ontology::canonical_iri`.
    iri: String,
    /// Column 3: label returned by `Ontology::label`.
    label: String,
    /// Column 5: `|`-separated parent IDs, walked by `Ontology::has_ancestor`.
    parents: Vec<String>,
    /// Column 6: source ontology namespace.
    namespace: OntologyNamespace,
    /// Column 7: SBOL-facing role.
    role: TermRole,
    /// Column 8: Component type family, or `None` for `-`.
    component_family: Option<ComponentTypeFamily>,
    /// Column 9: Sequence encoding family, or `None` for `-`.
    sequence_family: Option<SequenceEncodingFamily>,
    /// Column 10: flag reported by `Ontology::is_table_1_sequence_encoding`.
    table_1_sequence_encoding: bool,
    /// Column 11: flag reported by `Ontology::is_table_2_component_type`.
    table_2_component_type: bool,
}
120
121impl Ontology {
122    /// Returns the bundled offline ontology snapshot.
123    pub fn bundled() -> &'static Self {
124        static ONTOLOGY: OnceLock<Ontology> = OnceLock::new();
125        ONTOLOGY.get_or_init(|| {
126            Ontology::from_tsv(FACTS, SOURCES)
127                .expect("bundled SBOL ontology facts must parse successfully")
128        })
129    }
130
131    /// Returns provenance metadata for the upstream ontology sources.
132    pub fn provenance(&self) -> &[OntologyProvenance] {
133        &self.provenance
134    }
135
136    /// Returns the compact canonical ID for an IRI, PURL, or compact ID.
137    pub fn canonical_id(&self, term: &str) -> Option<String> {
138        if let Some(canonical) = self.aliases.get(term) {
139            return Some(canonical.clone());
140        }
141        let candidate = normalize_term_id(term)?;
142        self.terms.contains_key(&candidate).then_some(candidate)
143    }
144
145    /// Returns the preferred SBOL-facing IRI for a known term.
146    pub fn canonical_iri(&self, term: &str) -> Option<&str> {
147        let canonical = self.canonical_id(term)?;
148        self.terms.get(&canonical).map(|fact| fact.iri.as_str())
149    }
150
151    /// Returns true when the term exists in the bundled fact snapshot.
152    pub fn contains_term(&self, term: &str) -> bool {
153        self.canonical_id(term).is_some()
154    }
155
156    /// Returns the preferred label for a known term.
157    pub fn label(&self, term: &str) -> Option<&str> {
158        let canonical = self.canonical_id(term)?;
159        self.terms.get(&canonical).map(|fact| fact.label.as_str())
160    }
161
162    /// Returns the source ontology for a known term.
163    pub fn namespace(&self, term: &str) -> Option<OntologyNamespace> {
164        let canonical = self.canonical_id(term)?;
165        self.terms.get(&canonical).map(|fact| fact.namespace)
166    }
167
168    /// Returns the SBOL-facing role for a known term.
169    pub fn term_role(&self, term: &str) -> Option<TermRole> {
170        let canonical = self.canonical_id(term)?;
171        self.terms.get(&canonical).map(|fact| fact.role)
172    }
173
174    /// Returns whether a known term is a Sequence encoding term.
175    ///
176    /// `None` means the term is absent from the bundled facts.
177    pub fn is_sequence_encoding_term(&self, term: &str) -> Option<bool> {
178        self.term_role(term)
179            .map(|role| role == TermRole::SequenceEncoding)
180    }
181
182    /// Returns whether a known term is suitable for `sbol:type` on structural
183    /// SBOL entities.
184    ///
185    /// Component type modifiers such as topology and strand terms are accepted
186    /// because SBOL permits them as additional `type` values for DNA/RNA.
187    pub fn is_component_type_term(&self, term: &str) -> Option<bool> {
188        self.term_role(term).map(|role| {
189            matches!(
190                role,
191                TermRole::ComponentType | TermRole::ComponentTypeModifier
192            )
193        })
194    }
195
196    /// Returns whether a known term is suitable for `sbol:role` on Feature
197    /// objects.
198    pub fn is_feature_role_term(&self, term: &str) -> Option<bool> {
199        self.term_role(term)
200            .map(|role| role == TermRole::FeatureRole)
201    }
202
203    /// Returns whether a known term is suitable for `sbol:role` on Component
204    /// and Component-like Feature objects.
205    pub fn is_component_role_term(&self, term: &str) -> Option<bool> {
206        let canonical = self.canonical_id(term)?;
207        if self.component_role_terms.contains(&canonical) {
208            return Some(true);
209        }
210        self.terms
211            .get(&canonical)
212            .map(|fact| fact.role == TermRole::FeatureRole)
213    }
214
215    /// Returns whether a known role term is in the SO sequence feature branch.
216    pub fn is_sequence_feature_role_term(&self, term: &str) -> Option<bool> {
217        self.contains_term(term)
218            .then(|| self.is_in_branch(term, "SO:0000110"))
219    }
220
221    /// Returns whether a known term is a Cell Ontology cell type, i.e. is
222    /// equivalent to or descends from `CL:0000000`. Returns `None` for terms
223    /// absent from the bundled facts.
224    pub fn is_cell_type_term(&self, term: &str) -> Option<bool> {
225        self.contains_term(term)
226            .then(|| self.is_in_branch(term, "CL:0000000"))
227    }
228
229    /// Returns whether a known term is suitable for `sbol:type` on Interaction
230    /// objects.
231    pub fn is_interaction_type_term(&self, term: &str) -> Option<bool> {
232        self.term_role(term)
233            .map(|role| role == TermRole::InteractionType)
234    }
235
236    /// Returns whether a known term is suitable for `sbol:role` on
237    /// Participation objects.
238    pub fn is_participation_role_term(&self, term: &str) -> Option<bool> {
239        self.term_role(term)
240            .map(|role| role == TermRole::ParticipationRole)
241    }
242
243    /// Returns true for exact SBOL Table 1 Sequence encoding terms.
244    pub fn is_table_1_sequence_encoding(&self, term: &str) -> bool {
245        let Some(canonical) = self.canonical_id(term) else {
246            return false;
247        };
248        self.terms
249            .get(&canonical)
250            .is_some_and(|fact| fact.table_1_sequence_encoding)
251    }
252
253    /// Returns true for exact SBOL Table 2 Component type terms.
254    pub fn is_table_2_component_type(&self, term: &str) -> bool {
255        let Some(canonical) = self.canonical_id(term) else {
256            return false;
257        };
258        self.terms
259            .get(&canonical)
260            .is_some_and(|fact| fact.table_2_component_type)
261    }
262
263    /// Returns true if `term` is a strict descendant of `ancestor`.
264    pub fn is_descendant(&self, term: &str, ancestor: &str) -> bool {
265        let Some(term) = self.canonical_id(term) else {
266            return false;
267        };
268        let Some(ancestor) = self.canonical_id(ancestor) else {
269            return false;
270        };
271        if term == ancestor {
272            return false;
273        }
274        self.has_ancestor(&term, &ancestor)
275    }
276
277    /// Returns true if `term` is equal to or descends from `ancestor`.
278    pub fn is_equivalent_or_descendant(&self, term: &str, ancestor: &str) -> bool {
279        let Some(term) = self.canonical_id(term) else {
280            return false;
281        };
282        let Some(ancestor) = self.canonical_id(ancestor) else {
283            return false;
284        };
285        term == ancestor || self.has_ancestor(&term, &ancestor)
286    }
287
288    /// Alias for [`Ontology::is_equivalent_or_descendant`].
289    pub fn is_in_branch(&self, term: &str, branch_root: &str) -> bool {
290        let Some(term) = self.canonical_id(term) else {
291            return false;
292        };
293        let Some(branch_root) = self.canonical_id(branch_root) else {
294            return false;
295        };
296        term == branch_root
297            || self.branches.contains(&(term.clone(), branch_root.clone()))
298            || self.has_ancestor(&term, &branch_root)
299    }
300
301    /// Returns whether two known terms conflict.
302    ///
303    /// `None` means one or both terms are absent from the bundled facts.
304    pub fn terms_conflict(&self, left: &str, right: &str) -> Option<bool> {
305        let left = self.canonical_id(left)?;
306        let right = self.canonical_id(right)?;
307        if left == right {
308            return Some(false);
309        }
310        if self.conflicts.contains(&ordered_pair(&left, &right)) {
311            return Some(true);
312        }
313        let left_fact = self.terms.get(&left)?;
314        let right_fact = self.terms.get(&right)?;
315        if let (Some(left_family), Some(right_family)) =
316            (left_fact.component_family, right_fact.component_family)
317        {
318            return Some(left_family != right_family);
319        }
320        if let (Some(left_family), Some(right_family)) =
321            (left_fact.sequence_family, right_fact.sequence_family)
322        {
323            return Some(left_family != right_family);
324        }
325        Some(false)
326    }
327
328    /// Returns whether a Participation role is cross-listed for an Interaction
329    /// type in the bundled SBOL Table 11/Table 12 facts.
330    pub fn participation_role_compatible_with_interaction_type(
331        &self,
332        role: &str,
333        interaction_type: &str,
334    ) -> Option<bool> {
335        let role = self.canonical_id(role)?;
336        let interaction_type = self.canonical_id(interaction_type)?;
337        let role_fact = self.terms.get(&role)?;
338        let interaction_fact = self.terms.get(&interaction_type)?;
339        if role_fact.role != TermRole::ParticipationRole
340            || interaction_fact.role != TermRole::InteractionType
341        {
342            return None;
343        }
344        Some(
345            self.participation_compatibilities
346                .contains(&(interaction_type, role)),
347        )
348    }
349
350    /// Returns whether a Component role is compatible with a Component type.
351    ///
352    /// `None` means one or both terms are absent, or the terms do not have the
353    /// roles needed to answer this compatibility question.
354    pub fn component_role_compatible_with_component_type(
355        &self,
356        role: &str,
357        component_type: &str,
358    ) -> Option<bool> {
359        let role = self.canonical_id(role)?;
360        let component_type = self.canonical_id(component_type)?;
361        let role_fact = self.terms.get(&role)?;
362        let component_fact = self.terms.get(&component_type)?;
363        if role_fact.role != TermRole::FeatureRole || component_fact.role != TermRole::ComponentType
364        {
365            return None;
366        }
367        if self
368            .component_role_compatibilities
369            .contains(&(role.clone(), component_type.clone()))
370        {
371            return Some(true);
372        }
373
374        let component_family = component_fact.component_family?;
375        if self.is_in_branch(&role, "SO:0000110") {
376            return Some(component_family == ComponentTypeFamily::NucleicAcid);
377        }
378        if self.is_in_branch(&role, "GO:0003674") {
379            return Some(component_family == ComponentTypeFamily::Protein);
380        }
381        if self.is_in_branch(&role, "CHEBI:50906") {
382            return Some(component_family == ComponentTypeFamily::SimpleChemical);
383        }
384        None
385    }
386
387    /// Returns whether a Sequence encoding is compatible with a Component type.
388    ///
389    /// `None` means one or both terms are absent, or the terms do not have the
390    /// roles needed to answer this compatibility question.
391    pub fn encoding_compatible_with_component_type(
392        &self,
393        encoding: &str,
394        component_type: &str,
395    ) -> Option<bool> {
396        let encoding = self.canonical_id(encoding)?;
397        let component_type = self.canonical_id(component_type)?;
398        let encoding_fact = self.terms.get(&encoding)?;
399        let component_fact = self.terms.get(&component_type)?;
400        if encoding_fact.role != TermRole::SequenceEncoding
401            || component_fact.role != TermRole::ComponentType
402        {
403            return None;
404        }
405        if self
406            .compatibilities
407            .contains(&(encoding.clone(), component_type.clone()))
408        {
409            return Some(true);
410        }
411        let encoding_family = encoding_fact.sequence_family?;
412        let component_family = component_fact.component_family?;
413        Some(matches!(
414            (encoding_family, component_family),
415            (
416                SequenceEncodingFamily::NucleicAcid,
417                ComponentTypeFamily::NucleicAcid
418            ) | (
419                SequenceEncodingFamily::Protein,
420                ComponentTypeFamily::Protein
421            ) | (
422                SequenceEncodingFamily::SimpleChemical,
423                ComponentTypeFamily::SimpleChemical
424            )
425        ))
426    }
427
428    /// Returns the first Table 1 encoding compatible with a Component type.
429    pub fn recommended_sequence_encoding_for_component_type(
430        &self,
431        component_type: &str,
432    ) -> Option<&str> {
433        self.compatible_sequence_encodings_for_component_type(component_type)
434            .into_iter()
435            .next()
436    }
437
438    /// Returns all Table 1 encodings compatible with a Component type.
439    pub fn compatible_sequence_encodings_for_component_type(
440        &self,
441        component_type: &str,
442    ) -> Vec<&str> {
443        let Some(component_type) = self.canonical_id(component_type) else {
444            return Vec::new();
445        };
446        self.compatibilities
447            .iter()
448            .filter_map(|(encoding, compatible_component)| {
449                (compatible_component == &component_type)
450                    .then(|| self.terms.get(encoding).map(|fact| fact.iri.as_str()))
451                    .flatten()
452            })
453            .collect()
454    }
455
456    /// Returns the broad component family for a known term.
457    pub fn component_type_family(&self, component_type: &str) -> Option<ComponentTypeFamily> {
458        let canonical = self.canonical_id(component_type)?;
459        self.terms
460            .get(&canonical)
461            .and_then(|fact| fact.component_family)
462    }
463
464    /// Returns the broad sequence encoding family for a known term.
465    pub fn sequence_encoding_family(&self, encoding: &str) -> Option<SequenceEncodingFamily> {
466        let canonical = self.canonical_id(encoding)?;
467        self.terms
468            .get(&canonical)
469            .and_then(|fact| fact.sequence_family)
470    }
471
472    fn has_ancestor(&self, term: &str, ancestor: &str) -> bool {
473        let Some(fact) = self.terms.get(term) else {
474            return false;
475        };
476        fact.parents
477            .iter()
478            .any(|parent| parent == ancestor || self.has_ancestor(parent, ancestor))
479    }
480
481    /// Parses an ontology snapshot from a TSV string in the bundled format.
482    /// Provenance is left empty; use [`Ontology::set_provenance`] if you need
483    /// to attach metadata for diagnostic output.
484    pub fn from_tsv_str(facts: &str) -> Result<Self, String> {
485        Self::from_tsv(facts, "")
486    }
487
488    /// Parses an ontology snapshot from a TSV file on disk.
489    pub fn from_tsv_path(path: impl AsRef<Path>) -> Result<Self, io::Error> {
490        let path = path.as_ref();
491        let text = fs::read_to_string(path)?;
492        Self::from_tsv_str(&text)
493            .map_err(|message| io::Error::new(io::ErrorKind::InvalidData, message))
494    }
495
496    /// Replaces the snapshot's provenance entries.
497    pub fn set_provenance(&mut self, provenance: Vec<OntologyProvenance>) {
498        self.provenance = provenance;
499    }
500
501    /// Merges `other` into `self`. The current snapshot wins on every duplicate
502    /// term, alias, or compatibility row — extensions can add new facts but
503    /// cannot rewrite bundled ones. Provenance from `other` is appended.
504    pub fn extend_with(&mut self, other: Ontology) {
505        for (id, fact) in other.terms {
506            self.terms.entry(id).or_insert(fact);
507        }
508        for (alias, canonical) in other.aliases {
509            self.aliases.entry(alias).or_insert(canonical);
510        }
511        self.branches.extend(other.branches);
512        self.compatibilities.extend(other.compatibilities);
513        self.conflicts.extend(other.conflicts);
514        self.component_role_terms.extend(other.component_role_terms);
515        self.component_role_compatibilities
516            .extend(other.component_role_compatibilities);
517        self.participation_compatibilities
518            .extend(other.participation_compatibilities);
519        self.provenance.extend(other.provenance);
520    }
521
522    fn from_tsv(facts: &str, sources: &str) -> Result<Self, String> {
523        let mut ontology = Self {
524            terms: BTreeMap::new(),
525            aliases: BTreeMap::new(),
526            branches: BTreeSet::new(),
527            compatibilities: BTreeSet::new(),
528            conflicts: BTreeSet::new(),
529            component_role_terms: BTreeSet::new(),
530            component_role_compatibilities: BTreeSet::new(),
531            participation_compatibilities: BTreeSet::new(),
532            provenance: parse_sources(sources)?,
533        };
534
535        let mut format_version: Option<u32> = None;
536        for (line_number, line) in facts.lines().enumerate() {
537            let trimmed = line.trim_start();
538            if let Some(rest) = trimmed.strip_prefix("# format_version:") {
539                let value = rest.trim();
540                let parsed = value.parse::<u32>().map_err(|_| {
541                    format!(
542                        "ontology snapshot has unparseable format_version `{value}` on line {}",
543                        line_number + 1
544                    )
545                })?;
546                format_version = Some(parsed);
547                continue;
548            }
549            if line.trim().is_empty() || line.starts_with('#') {
550                continue;
551            }
552            let columns = line.split('\t').collect::<Vec<_>>();
553            match columns.first().copied() {
554                Some("term") => ontology.insert_term(&columns, line_number + 1)?,
555                Some("branch") => ontology.insert_branch(&columns, line_number + 1)?,
556                Some("compat") => ontology.insert_compatibility(&columns, line_number + 1)?,
557                Some("conflict") => ontology.insert_conflict(&columns, line_number + 1)?,
558                Some("component_role") => {
559                    ontology.insert_component_role_term(&columns, line_number + 1)?
560                }
561                Some("component_role_compat") => {
562                    ontology.insert_component_role_compatibility(&columns, line_number + 1)?
563                }
564                Some("participation_compat") => {
565                    ontology.insert_participation_compatibility(&columns, line_number + 1)?
566                }
567                Some(other) => {
568                    return Err(format!(
569                        "unknown ontology fact kind `{other}` on line {line_number}"
570                    ));
571                }
572                None => {}
573            }
574        }
575
576        match format_version {
577            Some(version) if version == TSV_FORMAT_VERSION => Ok(ontology),
578            Some(version) => Err(format!(
579                "ontology snapshot uses format_version {version} but this build only supports {TSV_FORMAT_VERSION}",
580            )),
581            None => Err(format!(
582                "ontology snapshot is missing the `# format_version: {TSV_FORMAT_VERSION}` header line",
583            )),
584        }
585    }
586
587    fn insert_term(&mut self, columns: &[&str], line_number: usize) -> Result<(), String> {
588        if columns.len() != 12 {
589            return Err(format!(
590                "term line {line_number} has {} columns",
591                columns.len()
592            ));
593        }
594        let id = columns[1].to_owned();
595        let aliases = split_list(columns[4]);
596        let parents = split_list(columns[5]);
597        let fact = TermFact {
598            iri: columns[2].to_owned(),
599            label: columns[3].to_owned(),
600            parents,
601            namespace: parse_namespace(columns[6])?,
602            role: parse_role(columns[7])?,
603            component_family: parse_component_family(columns[8])?,
604            sequence_family: parse_sequence_family(columns[9])?,
605            table_1_sequence_encoding: parse_bool(columns[10])?,
606            table_2_component_type: parse_bool(columns[11])?,
607        };
608
609        self.aliases.insert(id.clone(), id.clone());
610        self.aliases.insert(fact.iri.clone(), id.clone());
611        for alias in aliases {
612            self.aliases.insert(alias, id.clone());
613        }
614        self.terms.insert(id, fact);
615        Ok(())
616    }
617
618    fn insert_compatibility(&mut self, columns: &[&str], line_number: usize) -> Result<(), String> {
619        if columns.len() != 3 {
620            return Err(format!(
621                "compatibility line {line_number} has {} columns",
622                columns.len()
623            ));
624        }
625        self.compatibilities
626            .insert((columns[1].to_owned(), columns[2].to_owned()));
627        Ok(())
628    }
629
630    fn insert_branch(&mut self, columns: &[&str], line_number: usize) -> Result<(), String> {
631        if columns.len() != 3 {
632            return Err(format!(
633                "branch line {line_number} has {} columns",
634                columns.len()
635            ));
636        }
637        self.branches
638            .insert((columns[1].to_owned(), columns[2].to_owned()));
639        Ok(())
640    }
641
642    fn insert_conflict(&mut self, columns: &[&str], line_number: usize) -> Result<(), String> {
643        if columns.len() != 3 {
644            return Err(format!(
645                "conflict line {line_number} has {} columns",
646                columns.len()
647            ));
648        }
649        self.conflicts.insert(ordered_pair(columns[1], columns[2]));
650        Ok(())
651    }
652
653    fn insert_component_role_term(
654        &mut self,
655        columns: &[&str],
656        line_number: usize,
657    ) -> Result<(), String> {
658        if columns.len() != 2 {
659            return Err(format!(
660                "component role line {line_number} has {} columns",
661                columns.len()
662            ));
663        }
664        self.component_role_terms.insert(columns[1].to_owned());
665        Ok(())
666    }
667
668    fn insert_component_role_compatibility(
669        &mut self,
670        columns: &[&str],
671        line_number: usize,
672    ) -> Result<(), String> {
673        if columns.len() != 3 {
674            return Err(format!(
675                "component role compatibility line {line_number} has {} columns",
676                columns.len()
677            ));
678        }
679        self.component_role_compatibilities
680            .insert((columns[1].to_owned(), columns[2].to_owned()));
681        Ok(())
682    }
683
684    fn insert_participation_compatibility(
685        &mut self,
686        columns: &[&str],
687        line_number: usize,
688    ) -> Result<(), String> {
689        if columns.len() != 3 {
690            return Err(format!(
691                "participation compatibility line {line_number} has {} columns",
692                columns.len()
693            ));
694        }
695        self.participation_compatibilities
696            .insert((columns[1].to_owned(), columns[2].to_owned()));
697        Ok(())
698    }
699}
700
701fn parse_sources(sources: &str) -> Result<Vec<OntologyProvenance>, String> {
702    let mut provenance = Vec::new();
703    for (line_number, line) in sources.lines().enumerate() {
704        if line.trim().is_empty() || line.starts_with('#') {
705            continue;
706        }
707        let columns = line.split('\t').collect::<Vec<_>>();
708        if columns.len() != 8 {
709            return Err(format!(
710                "ontology source line {line_number} has {} columns",
711                columns.len()
712            ));
713        }
714        provenance.push(OntologyProvenance {
715            ontology: columns[0].to_owned(),
716            source_url: columns[1].to_owned(),
717            version: columns[2].to_owned(),
718            license: columns[3].to_owned(),
719            retrieved: columns[4].to_owned(),
720            raw_sha256: columns[5].to_owned(),
721            fact_sha256: columns[6].to_owned(),
722            notes: columns[7].to_owned(),
723        });
724    }
725    Ok(provenance)
726}
727
/// Splits a `|`-separated TSV cell into its items.
///
/// The placeholder `-` denotes an empty list. Empty segments (an empty cell,
/// or a stray leading/trailing/doubled `|`) are dropped so that the empty
/// string is never registered as an alias or parent ID.
fn split_list(value: &str) -> Vec<String> {
    if value == "-" {
        return Vec::new();
    }
    value
        .split('|')
        .filter(|item| !item.is_empty())
        .map(ToOwned::to_owned)
        .collect()
}
734
/// Parses a boolean TSV cell; only the exact strings `true` and `false` are
/// accepted.
fn parse_bool(value: &str) -> Result<bool, String> {
    value
        .parse::<bool>()
        .map_err(|_| format!("invalid boolean `{value}`"))
}
742
743fn parse_namespace(value: &str) -> Result<OntologyNamespace, String> {
744    match value {
745        "EDAM" => Ok(OntologyNamespace::Edam),
746        "SBO" => Ok(OntologyNamespace::Sbo),
747        "SO" => Ok(OntologyNamespace::So),
748        "GO" => Ok(OntologyNamespace::Go),
749        "CHEBI" => Ok(OntologyNamespace::Chebi),
750        "CL" => Ok(OntologyNamespace::Cl),
751        "NCIT" => Ok(OntologyNamespace::Ncit),
752        _ => Err(format!("unknown ontology namespace `{value}`")),
753    }
754}
755
756fn parse_role(value: &str) -> Result<TermRole, String> {
757    match value {
758        "sequence_encoding" => Ok(TermRole::SequenceEncoding),
759        "component_type" => Ok(TermRole::ComponentType),
760        "component_type_modifier" => Ok(TermRole::ComponentTypeModifier),
761        "interaction_type" => Ok(TermRole::InteractionType),
762        "participation_role" => Ok(TermRole::ParticipationRole),
763        "feature_role" => Ok(TermRole::FeatureRole),
764        "other" => Ok(TermRole::Other),
765        _ => Err(format!("unknown term role `{value}`")),
766    }
767}
768
769fn parse_component_family(value: &str) -> Result<Option<ComponentTypeFamily>, String> {
770    match value {
771        "-" => Ok(None),
772        "nucleic_acid" => Ok(Some(ComponentTypeFamily::NucleicAcid)),
773        "protein" => Ok(Some(ComponentTypeFamily::Protein)),
774        "simple_chemical" => Ok(Some(ComponentTypeFamily::SimpleChemical)),
775        "complex" => Ok(Some(ComponentTypeFamily::Complex)),
776        "functional" => Ok(Some(ComponentTypeFamily::Functional)),
777        _ => Err(format!("unknown Component type family `{value}`")),
778    }
779}
780
781fn parse_sequence_family(value: &str) -> Result<Option<SequenceEncodingFamily>, String> {
782    match value {
783        "-" => Ok(None),
784        "nucleic_acid" => Ok(Some(SequenceEncodingFamily::NucleicAcid)),
785        "protein" => Ok(Some(SequenceEncodingFamily::Protein)),
786        "simple_chemical" => Ok(Some(SequenceEncodingFamily::SimpleChemical)),
787        "other_textual" => Ok(Some(SequenceEncodingFamily::OtherTextual)),
788        _ => Err(format!("unknown Sequence encoding family `{value}`")),
789    }
790}
791
792/// Returns the compact ontology ID for a supported IRI, PURL, or compact ID.
793pub fn normalize_term_id(value: &str) -> Option<String> {
794    if let Some((prefix, local)) = value.split_once(':')
795        && !value.starts_with("http://")
796        && !value.starts_with("https://")
797    {
798        return Some(format!("{}:{local}", normalize_prefix(prefix)?));
799    }
800
801    if let Some(rest) = value
802        .strip_prefix("https://identifiers.org/")
803        .or_else(|| value.strip_prefix("http://identifiers.org/"))
804    {
805        let (prefix, local) = rest.split_once(':')?;
806        return Some(format!("{}:{local}", normalize_prefix(prefix)?));
807    }
808
809    if let Some(local) = value.strip_prefix("http://edamontology.org/") {
810        return Some(format!("EDAM:{local}"));
811    }
812
813    if let Some(local) = value
814        .strip_prefix("http://biomodels.net/SBO/SBO_")
815        .or_else(|| value.strip_prefix("https://biomodels.net/SBO/SBO_"))
816    {
817        return Some(format!("SBO:{local}"));
818    }
819
820    if let Some(local) = value.strip_prefix("http://purl.obolibrary.org/obo/") {
821        let (prefix, suffix) = local.split_once('_')?;
822        return Some(format!("{}:{suffix}", normalize_prefix(prefix)?));
823    }
824
825    None
826}
827
/// Returns owned copies of `left` and `right` with the lexicographically
/// smaller string first, so a pair has one canonical ordering regardless of
/// argument order.
fn ordered_pair(left: &str, right: &str) -> (String, String) {
    let (first, second) = if left > right {
        (right, left)
    } else {
        (left, right)
    };
    (first.to_owned(), second.to_owned())
}
835
836/// A read-only view that layers zero-or-more extension snapshots on top of
837/// the bundled [`Ontology`].
838///
839/// The bundled snapshot is always present; extensions add new terms, aliases,
840/// branch memberships, and compatibility rows without overriding bundled
841/// facts. Construct one through [`OntologyRegistry::bundled_only`] or
842/// [`OntologyRegistry::bundled_with`] and pass it (or its inner [`Ontology`])
843/// to the validator.
844#[derive(Clone, Debug)]
845pub struct OntologyRegistry {
846    inner: Cow<'static, Ontology>,
847}
848
849impl OntologyRegistry {
850    /// Registry containing only the bundled snapshot. Zero allocation.
851    pub fn bundled_only() -> Self {
852        Self {
853            inner: Cow::Borrowed(Ontology::bundled()),
854        }
855    }
856
857    /// Registry containing the bundled snapshot plus the provided extension
858    /// snapshots applied in order. Bundled facts win on conflicts.
859    pub fn bundled_with<I>(extensions: I) -> Self
860    where
861        I: IntoIterator<Item = Ontology>,
862    {
863        let mut iter = extensions.into_iter();
864        let Some(first) = iter.next() else {
865            return Self::bundled_only();
866        };
867        let mut merged = Ontology::bundled().clone();
868        merged.extend_with(first);
869        for ext in iter {
870            merged.extend_with(ext);
871        }
872        Self {
873            inner: Cow::Owned(merged),
874        }
875    }
876
877    /// Adds another extension snapshot on top of this registry.
878    pub fn with_extension(mut self, extension: Ontology) -> Self {
879        let merged = self.inner.to_mut();
880        merged.extend_with(extension);
881        self
882    }
883
884    /// Returns the merged snapshot as an [`Ontology`].
885    pub fn ontology(&self) -> &Ontology {
886        self.inner.as_ref()
887    }
888}
889
890impl Default for OntologyRegistry {
891    fn default() -> Self {
892        Self::bundled_only()
893    }
894}
895
896impl AsRef<Ontology> for OntologyRegistry {
897    fn as_ref(&self) -> &Ontology {
898        self.ontology()
899    }
900}
901
/// Maps an ontology prefix, in any ASCII letter case, to its canonical
/// upper-case spelling.
///
/// Returns `None` for prefixes outside the supported set.
fn normalize_prefix(prefix: &str) -> Option<&'static str> {
    // Canonical spellings of every supported ontology prefix.
    const SUPPORTED: &[&str] = &["EDAM", "SBO", "SO", "GO", "CHEBI", "CL", "NCIT"];

    SUPPORTED
        .iter()
        .copied()
        .find(|canonical| canonical.eq_ignore_ascii_case(prefix))
}
914
#[cfg(test)]
mod tests {
    use super::*;

    /// IRI outside every supported ontology; the queries below are expected
    /// to answer `None` for it rather than `Some(false)`.
    const UNKNOWN_IRI: &str = "https://example.org/custom";

    /// Extension snapshot declaring one synthetic CL term plus a branch row
    /// that places it under `CL:0000000`.
    fn synthetic_extension_tsv() -> &'static str {
        concat!(
            "# format_version: 1\n",
            "# kind\tid\tiri\tlabel\taliases\tparents\tontology\trole\tcomponent_family\tsequence_family\ttable1\ttable2\n",
            "term\tCL:9999999\thttps://identifiers.org/CL:9999999\tlab-only synthetic cell\t-\tCL:0000540\tCL\tcomponent_type\t-\t-\tfalse\tfalse\n",
            "branch\tCL:9999999\tCL:0000000\n",
        )
    }

    #[test]
    fn bundled_ontology_loads_core_terms() {
        let bundled = Ontology::bundled();

        // One representative term per bundled namespace.
        for iri in [
            "https://identifiers.org/edam:format_1207",
            "https://identifiers.org/SBO:0000251",
            "https://identifiers.org/SO:0000987",
            "https://identifiers.org/GO:0003700",
            "https://identifiers.org/CHEBI:35224",
            "https://identifiers.org/CL:0000540",
        ] {
            assert!(bundled.contains_term(iri), "bundled snapshot lacks `{iri}`");
        }
        assert!(!bundled.provenance().is_empty());
    }

    #[test]
    fn from_tsv_str_rejects_missing_format_version() {
        // A lone term row with no `# format_version` header.
        let headerless =
            "term\tFOO:1\thttps://example.org/foo\tfoo\t-\t-\tEDAM\tother\t-\t-\tfalse\tfalse\n";
        let result = Ontology::from_tsv_str(headerless);
        assert!(
            result.is_err(),
            "expected missing-header error, got {result:?}"
        );
    }

    #[test]
    fn from_tsv_str_rejects_unknown_format_version() {
        let bumped = concat!(
            "# format_version: 9999\n",
            "# kind\tid\tiri\tlabel\taliases\tparents\tontology\trole\tcomponent_family\tsequence_family\ttable1\ttable2\n",
            "term\tEDAM:format_1915\thttps://identifiers.org/edam:format_1915\tFormat\t-\t-\tEDAM\tother\t-\t-\tfalse\tfalse\n",
        );
        let err = Ontology::from_tsv_str(bumped).unwrap_err();
        assert!(
            err.contains("format_version 9999"),
            "unexpected error `{err}`"
        );
    }

    #[test]
    fn ontology_registry_layers_extensions_over_bundled() {
        let extension = Ontology::from_tsv_str(synthetic_extension_tsv()).unwrap();
        let registry = OntologyRegistry::bundled_with([extension]);
        let merged = registry.ontology();

        // The extension term is visible and classified through its branch row.
        assert!(merged.contains_term("CL:9999999"));
        assert_eq!(merged.is_cell_type_term("CL:9999999"), Some(true));
        // Bundled facts still resolve normally.
        assert_eq!(merged.is_cell_type_term("CL:0000540"), Some(true));
    }

    #[test]
    fn ontology_registry_bundled_only_borrows_static() {
        // Two extension-free registries must expose the very same bundled
        // snapshot, which shows the default path does not clone it.
        let first = OntologyRegistry::bundled_only();
        let second = OntologyRegistry::bundled_only();
        assert!(std::ptr::eq(first.ontology(), second.ontology()));
    }

    #[test]
    fn cell_ontology_terms_resolve_via_branch_root() {
        let bundled = Ontology::bundled();

        for term in [
            "CL:0000540",
            "CL:0000084",
            "CL:0000000",
            "http://purl.obolibrary.org/obo/CL_0000540",
        ] {
            assert_eq!(
                bundled.is_cell_type_term(term),
                Some(true),
                "`{term}` should be a cell type term"
            );
        }
        assert_eq!(
            bundled.namespace("CL:0000540"),
            Some(OntologyNamespace::Cl)
        );
        // A known non-CL term is a definite "no"; an unknown IRI is undecided.
        assert_eq!(bundled.is_cell_type_term("SO:0000316"), Some(false));
        assert_eq!(bundled.is_cell_type_term(UNKNOWN_IRI), None);
    }

    #[test]
    fn normalizes_identifiers_org_obo_purls_and_native_edam_iris() {
        let bundled = Ontology::bundled();

        for (iri, compact) in [
            ("http://edamontology.org/format_1207", "EDAM:format_1207"),
            ("http://purl.obolibrary.org/obo/SBO_0000251", "SBO:0000251"),
            ("https://identifiers.org/SO:0000987", "SO:0000987"),
        ] {
            assert_eq!(
                bundled.canonical_id(iri),
                Some(compact.to_owned()),
                "canonical ID of `{iri}`"
            );
        }
    }

    #[test]
    fn branch_queries_follow_parent_links() {
        let bundled = Ontology::bundled();

        assert!(bundled.is_descendant("EDAM:format_1207", "EDAM:format_2330"));

        // (term, branch root) pairs that must report branch membership.
        for (term, root) in [
            ("EDAM:format_3752", "EDAM:format_2330"),
            ("SBO:0000243", "SBO:0000236"),
            ("SBO:0000176", "SBO:0000231"),
            ("SBO:0000010", "SBO:0000003"),
            ("EDAM:format_1207", "EDAM:format_1915"),
            ("SO:0000987", "SO:0000986"),
            ("SO:0000984", "SO:0000983"),
            ("SO:0000167", "SO:0000110"),
            ("GO:0001216", "GO:0003674"),
            ("GO:0003700", "GO:0003674"),
            ("CHEBI:35224", "CHEBI:50906"),
        ] {
            assert!(
                bundled.is_in_branch(term, root),
                "`{term}` should be in the `{root}` branch"
            );
        }

        assert!(!bundled.is_in_branch("SO:0000987", "SO:0000983"));
        // A term is equivalent to itself but not its own descendant.
        assert!(bundled.is_equivalent_or_descendant("EDAM:format_2330", "EDAM:format_2330"));
        assert!(!bundled.is_descendant("EDAM:format_2330", "EDAM:format_2330"));
    }

    #[test]
    fn compatibility_and_conflict_queries_distinguish_unknowns() {
        let bundled = Ontology::bundled();

        // Encoding vs. component type: yes, no, and undecided.
        for (encoding, expected) in [
            ("EDAM:format_1207", Some(true)),
            ("EDAM:format_1208", Some(false)),
            (UNKNOWN_IRI, None),
        ] {
            assert_eq!(
                bundled.encoding_compatible_with_component_type(encoding, "SBO:0000251"),
                expected,
                "encoding `{encoding}` against SBO:0000251"
            );
        }

        // Term conflicts: yes, no, and undecided.
        for (left, right, expected) in [
            ("SBO:0000251", "SBO:0000252", Some(true)),
            ("SBO:0000169", "SBO:0000170", Some(true)),
            ("SBO:0000251", "SO:0000987", Some(false)),
            ("SBO:0000251", UNKNOWN_IRI, None),
        ] {
            assert_eq!(
                bundled.terms_conflict(left, right),
                expected,
                "conflict between `{left}` and `{right}`"
            );
        }

        // Unknown IRIs stay undecided in the classification queries too.
        assert_eq!(bundled.is_component_type_term(UNKNOWN_IRI), None);
        assert_eq!(bundled.is_component_role_term(UNKNOWN_IRI), None);
        assert_eq!(
            bundled.component_role_compatible_with_component_type(UNKNOWN_IRI, "SBO:0000251"),
            None
        );
    }

    #[test]
    fn role_queries_cover_sbol_feature_interaction_and_participation_terms() {
        let bundled = Ontology::bundled();

        assert_eq!(bundled.is_feature_role_term("SO:0000167"), Some(true));
        for role in ["SO:0000167", "SBO:0000289", "SBO:0000290"] {
            assert_eq!(
                bundled.is_component_role_term(role),
                Some(true),
                "`{role}` should be a component role term"
            );
        }
        assert_eq!(
            bundled.is_sequence_feature_role_term("SO:0000167"),
            Some(true)
        );
        assert_eq!(bundled.is_feature_role_term("SBO:0000176"), Some(false));

        for component_type in ["SBO:0000243", "SBO:0000290"] {
            assert_eq!(
                bundled.is_component_type_term(component_type),
                Some(true),
                "`{component_type}` should be a component type term"
            );
        }
        assert_eq!(bundled.is_interaction_type_term("SBO:0000176"), Some(true));
        assert_eq!(
            bundled.is_participation_role_term("SBO:0000010"),
            Some(true)
        );

        // (participation role, interaction type, compatible?)
        for (role, interaction, compatible) in [
            ("SBO:0000010", "SBO:0000176", true),
            ("SBO:0000459", "SBO:0000169", false),
        ] {
            assert_eq!(
                bundled.participation_role_compatible_with_interaction_type(role, interaction),
                Some(compatible),
                "participation role `{role}` with interaction `{interaction}`"
            );
        }

        // (component role, component type, compatible?)
        for (role, component_type, compatible) in [
            ("SO:0000167", "SBO:0000251", true),
            ("GO:0003700", "SBO:0000252", true),
            ("CHEBI:35224", "SBO:0000247", true),
            ("SO:0000167", "SBO:0000252", false),
        ] {
            assert_eq!(
                bundled.component_role_compatible_with_component_type(role, component_type),
                Some(compatible),
                "component role `{role}` with component type `{component_type}`"
            );
        }
    }
}