// ontoindex_parser/rdf.rs

1use crate::vocab::{Rdf, Rdfs, OWL};
2use ontoindex_core::{
3    limits::{MAX_FILE_BYTES, MAX_TRIPLES_PER_FILE},
4    Annotation, Axiom, Entity, EntityKind, Import, Namespace, OntologyFormat, ParseStatus,
5    SourceLocation, AXIOM_KIND_SUB_CLASS_OF,
6};
7use oxigraph::io::{RdfFormat, RdfParseError, RdfParser};
8use oxigraph::model::{Quad, Subject, Term};
9use std::collections::{BTreeMap, BTreeSet, HashMap};
10use std::fs;
11use std::path::Path;
12use thiserror::Error;
13
14#[derive(Debug, Error)]
15pub enum ParseError {
16    #[error("IO error: {0}")]
17    Io(#[from] std::io::Error),
18
19    #[error("RDF parse error: {0}")]
20    Rdf(String),
21
22    #[error("unsupported format: {0}")]
23    UnsupportedFormat(String),
24
25    #[error("limit exceeded: {0}")]
26    LimitExceeded(String),
27}
28
29pub type Result<T> = std::result::Result<T, ParseError>;
30
31#[derive(Debug, Clone)]
32pub struct ParsedOntology {
33    pub ontology_id: String,
34    pub base_iri: Option<String>,
35    pub imports: Vec<String>,
36    pub namespaces: BTreeMap<String, String>,
37    pub entities: Vec<Entity>,
38    pub annotations: Vec<Annotation>,
39    pub axioms: Vec<Axiom>,
40    pub namespace_rows: Vec<Namespace>,
41    pub import_rows: Vec<Import>,
42    pub parse_status: ParseStatus,
43    pub parse_message: Option<String>,
44    pub parse_error_location: Option<SourceLocation>,
45    pub triple_count: usize,
46    quads: Vec<Quad>,
47}
48
49impl ParsedOntology {
50    /// RDF quads for catalog indexing only — not a stable public API.
51    #[doc(hidden)]
52    pub fn quads(&self) -> &[Quad] {
53        &self.quads
54    }
55}
56
57pub fn parse_ontology_file(
58    path: &Path,
59    format: OntologyFormat,
60    ontology_id: &str,
61    content_hash: &str,
62    modified_time: u64,
63) -> Result<ParsedOntology> {
64    let _ = (content_hash, modified_time);
65    let metadata = fs::metadata(path)?;
66    if metadata.len() > MAX_FILE_BYTES {
67        return Err(ParseError::LimitExceeded(format!(
68            "file exceeds {MAX_FILE_BYTES} bytes: {}",
69            path.display()
70        )));
71    }
72    let content = fs::read(path)?;
73    let source_text = String::from_utf8_lossy(&content).into_owned();
74    parse_ontology_text(path, format, ontology_id, &source_text, &content)
75}
76
77/// Parse ontology source text (used for LSP open buffers and file parsing).
78pub fn parse_ontology_text(
79    path: &Path,
80    format: OntologyFormat,
81    ontology_id: &str,
82    source_text: &str,
83    raw_bytes: &[u8],
84) -> Result<ParsedOntology> {
85    if raw_bytes.len() as u64 > MAX_FILE_BYTES {
86        return Err(ParseError::LimitExceeded(format!(
87            "source exceeds {MAX_FILE_BYTES} bytes: {}",
88            path.display()
89        )));
90    }
91    let rdf_format = to_rdf_format(format, path)?;
92
93    let mut quads = Vec::new();
94    let mut parse_message = None;
95    let mut parse_error_location = None;
96    let mut parse_status = ParseStatus::Ok;
97
98    let parser = RdfParser::from_format(rdf_format);
99    for quad in parser.for_reader(raw_bytes) {
100        match quad {
101            Ok(q) => {
102                if quads.len() >= MAX_TRIPLES_PER_FILE {
103                    return Err(ParseError::LimitExceeded(format!(
104                        "file exceeds {MAX_TRIPLES_PER_FILE} triples: {}",
105                        path.display()
106                    )));
107                }
108                quads.push(q);
109            }
110            Err(e) => {
111                parse_status = ParseStatus::Error;
112                parse_message = Some(format_parse_error(&e));
113                parse_error_location = extract_parse_error_location(&e, source_text);
114                break;
115            }
116        }
117    }
118
119    if parse_status == ParseStatus::Error {
120        return Ok(empty_result(
121            ontology_id,
122            parse_status,
123            parse_message,
124            parse_error_location,
125            BTreeMap::new(),
126        ));
127    }
128
129    let mut namespaces =
130        if format == OntologyFormat::TriG { extract_prefixes(&quads) } else { BTreeMap::new() };
131    namespaces.extend(extract_declared_prefixes(source_text, format));
132    if namespaces.is_empty() {
133        namespaces.insert("".to_string(), default_base_iri(path));
134    }
135
136    let mut builder = OntologyBuilder::new(ontology_id.to_string(), namespaces.clone());
137    for quad in &quads {
138        builder.ingest_quad(quad);
139    }
140    builder.finish(parse_status, parse_message, parse_error_location, source_text, quads)
141}
142
143fn to_rdf_format(format: OntologyFormat, path: &Path) -> Result<RdfFormat> {
144    match format {
145        OntologyFormat::Turtle => Ok(RdfFormat::Turtle),
146        OntologyFormat::RdfXml | OntologyFormat::Owl => Ok(RdfFormat::RdfXml),
147        OntologyFormat::JsonLd => Ok(RdfFormat::JsonLd { profile: Default::default() }),
148        OntologyFormat::NTriples => Ok(RdfFormat::NTriples),
149        OntologyFormat::NQuads => Ok(RdfFormat::NQuads),
150        OntologyFormat::TriG => Ok(RdfFormat::TriG),
151        OntologyFormat::Unknown => Err(ParseError::UnsupportedFormat(path.display().to_string())),
152    }
153}
154
155fn format_parse_error(error: &RdfParseError) -> String {
156    error.to_string()
157}
158
159fn extract_parse_error_location(
160    error: &RdfParseError,
161    _source_text: &str,
162) -> Option<SourceLocation> {
163    let msg = error.to_string();
164    let line = msg.split_whitespace().collect::<Vec<_>>().windows(2).find_map(|w| {
165        if w[0].eq_ignore_ascii_case("line") {
166            w[1].trim_end_matches(':').parse().ok()
167        } else {
168            None
169        }
170    });
171    let column = msg.split_whitespace().collect::<Vec<_>>().windows(2).find_map(|w| {
172        if w[0].eq_ignore_ascii_case("column") || w[0].eq_ignore_ascii_case("col") {
173            w[1].trim_end_matches(':').parse().ok()
174        } else {
175            None
176        }
177    });
178    if line.is_some() || column.is_some() {
179        Some(SourceLocation { line, column, ..Default::default() })
180    } else {
181        None
182    }
183}
184
/// Builds the fallback base IRI used when a source declares no namespaces.
///
/// The result is `file://` followed by the path exactly as displayed; the
/// path is neither made absolute nor percent-encoded.
// NOTE(review): relative or Windows-style paths therefore do not yield a
// well-formed `file:` URI — confirm whether consumers rely on one.
fn default_base_iri(path: &Path) -> String {
    format!("file://{}", path.display())
}
188
189fn extract_declared_prefixes(
190    source_text: &str,
191    format: OntologyFormat,
192) -> BTreeMap<String, String> {
193    let mut prefixes = BTreeMap::new();
194
195    if matches!(format, OntologyFormat::Turtle | OntologyFormat::TriG) {
196        for line in source_text.lines() {
197            let trimmed = line.trim();
198            let rest = trimmed
199                .strip_prefix("@prefix ")
200                .or_else(|| trimmed.strip_prefix("@PREFIX "))
201                .or_else(|| trimmed.strip_prefix("PREFIX "));
202            let Some(rest) = rest else {
203                continue;
204            };
205            let Some((prefix_part, iri_part)) = rest.split_once('<') else {
206                continue;
207            };
208            let prefix = prefix_part.trim().trim_end_matches(':');
209            let Some(iri) = iri_part.split('>').next() else {
210                continue;
211            };
212            prefixes.insert(prefix.to_string(), iri.to_string());
213        }
214        return prefixes;
215    }
216
217    if matches!(format, OntologyFormat::RdfXml | OntologyFormat::Owl) {
218        static XMLNS_ATTR: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
219            regex::Regex::new(r#"xmlns(?::([A-Za-z][\w-]*))?="([^"]+)""#).expect("xmlns regex")
220        });
221        for cap in XMLNS_ATTR.captures_iter(source_text) {
222            let prefix = cap.get(1).map(|m| m.as_str()).unwrap_or("");
223            let iri = cap.get(2).map(|m| m.as_str()).unwrap_or("");
224            if !iri.is_empty() {
225                prefixes.insert(prefix.to_string(), iri.to_string());
226            }
227        }
228    }
229
230    prefixes
231}
232
233fn extract_prefixes(quads: &[Quad]) -> BTreeMap<String, String> {
234    let mut prefixes = BTreeMap::new();
235    for quad in quads {
236        if let oxigraph::model::GraphNameRef::NamedNode(graph) = quad.graph_name.as_ref() {
237            let iri = graph.as_str();
238            if let Some((prefix, _)) = iri.rsplit_once('#') {
239                if let Some((p, _)) = prefix.rsplit_once('/') {
240                    prefixes
241                        .entry(short_name_from_iri(p))
242                        .or_insert_with(|| format!("{}/", prefix.trim_end_matches('#')));
243                }
244            }
245        }
246    }
247    prefixes
248}
249
250fn empty_result(
251    ontology_id: &str,
252    parse_status: ParseStatus,
253    parse_message: Option<String>,
254    parse_error_location: Option<SourceLocation>,
255    namespaces: BTreeMap<String, String>,
256) -> ParsedOntology {
257    ParsedOntology {
258        ontology_id: ontology_id.to_string(),
259        base_iri: namespaces.values().next().cloned(),
260        imports: Vec::new(),
261        namespaces: namespaces.clone(),
262        entities: Vec::new(),
263        annotations: Vec::new(),
264        axioms: Vec::new(),
265        namespace_rows: namespaces
266            .into_iter()
267            .map(|(prefix, iri)| Namespace { prefix, iri, ontology_id: ontology_id.to_string() })
268            .collect(),
269        import_rows: Vec::new(),
270        parse_status,
271        parse_message,
272        parse_error_location,
273        triple_count: 0,
274        quads: Vec::new(),
275    }
276}
277
278struct EntityState {
279    kind: EntityKind,
280    labels: Vec<String>,
281    comments: Vec<String>,
282    deprecated: bool,
283    types: BTreeSet<String>,
284}
285
286struct OntologyBuilder {
287    ontology_id: String,
288    namespaces: BTreeMap<String, String>,
289    entities: HashMap<String, EntityState>,
290    annotations: Vec<Annotation>,
291    axioms: Vec<Axiom>,
292    imports: BTreeSet<String>,
293    ontology_iris: BTreeSet<String>,
294    triple_count: usize,
295    axiom_counter: usize,
296}
297
298impl OntologyBuilder {
299    fn new(ontology_id: String, namespaces: BTreeMap<String, String>) -> Self {
300        Self {
301            ontology_id,
302            namespaces,
303            entities: HashMap::new(),
304            annotations: Vec::new(),
305            axioms: Vec::new(),
306            imports: BTreeSet::new(),
307            ontology_iris: BTreeSet::new(),
308            triple_count: 0,
309            axiom_counter: 0,
310        }
311    }
312
313    fn ingest_quad(&mut self, quad: &Quad) {
314        self.triple_count += 1;
315        let subject = subject_to_string(&quad.subject);
316        let predicate = quad.predicate.as_str().to_string();
317        let object = term_to_string(&quad.object);
318
319        if quad.predicate == Rdf::type_() {
320            if let Term::NamedNode(node) = &quad.object {
321                let type_iri = node.as_str();
322                if type_iri == OWL::ontology().as_str() {
323                    self.ontology_iris.insert(subject.clone());
324                }
325                let kind = entity_kind_for_type(type_iri);
326                if kind != EntityKind::Other {
327                    let entry =
328                        self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
329                            kind,
330                            labels: Vec::new(),
331                            comments: Vec::new(),
332                            deprecated: false,
333                            types: BTreeSet::new(),
334                        });
335                    entry.types.insert(type_iri.to_string());
336                    if entry.kind == EntityKind::Other
337                        || kind_priority(kind) > kind_priority(entry.kind)
338                    {
339                        entry.kind = kind;
340                    }
341                }
342            }
343            return;
344        }
345
346        if quad.predicate == OWL::imports() {
347            self.imports.insert(object.clone());
348            return;
349        }
350
351        if quad.predicate == Rdfs::label() {
352            if let Some(entity) = self.entities.get_mut(&subject) {
353                entity.labels.push(object.clone());
354            } else {
355                self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
356                    kind: EntityKind::Other,
357                    labels: vec![object.clone()],
358                    comments: Vec::new(),
359                    deprecated: false,
360                    types: BTreeSet::new(),
361                });
362            }
363            self.annotations.push(Annotation {
364                subject: subject.clone(),
365                predicate: predicate.clone(),
366                object: object.clone(),
367                ontology_id: self.ontology_id.clone(),
368                source_location: SourceLocation::default(),
369            });
370            return;
371        }
372
373        if quad.predicate == Rdfs::comment() {
374            if let Some(entity) = self.entities.get_mut(&subject) {
375                entity.comments.push(object.clone());
376            } else {
377                self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
378                    kind: EntityKind::Other,
379                    labels: Vec::new(),
380                    comments: vec![object.clone()],
381                    deprecated: false,
382                    types: BTreeSet::new(),
383                });
384            }
385            self.annotations.push(Annotation {
386                subject: subject.clone(),
387                predicate: predicate.clone(),
388                object: object.clone(),
389                ontology_id: self.ontology_id.clone(),
390                source_location: SourceLocation::default(),
391            });
392            return;
393        }
394
395        if quad.predicate == OWL::deprecated() {
396            let entry = self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
397                kind: EntityKind::Other,
398                labels: Vec::new(),
399                comments: Vec::new(),
400                deprecated: false,
401                types: BTreeSet::new(),
402            });
403            entry.deprecated = ontoindex_core::parse_boolean_literal(&object).unwrap_or(false);
404            return;
405        }
406
407        if quad.predicate == Rdfs::sub_class_of() {
408            self.axiom_counter += 1;
409            self.axioms.push(Axiom {
410                id: format!("{}#axiom-{}", self.ontology_id, self.axiom_counter),
411                ontology_id: self.ontology_id.clone(),
412                subject: subject.clone(),
413                predicate: predicate.clone(),
414                object: object.clone(),
415                axiom_kind: AXIOM_KIND_SUB_CLASS_OF.to_string(),
416                source_location: SourceLocation::default(),
417            });
418        }
419    }
420
421    fn finish(
422        self,
423        parse_status: ParseStatus,
424        parse_message: Option<String>,
425        parse_error_location: Option<SourceLocation>,
426        source_text: &str,
427        quads: Vec<Quad>,
428    ) -> Result<ParsedOntology> {
429        let base_iri = self
430            .ontology_iris
431            .iter()
432            .next()
433            .cloned()
434            .or_else(|| self.namespaces.get("").cloned())
435            .or_else(|| self.namespaces.values().next().cloned());
436
437        let ontology_id = if let Some(iri) = self.ontology_iris.iter().next() {
438            iri.clone()
439        } else {
440            self.ontology_id.clone()
441        };
442
443        let mut entities = Vec::new();
444        for (iri, state) in &self.entities {
445            if state.kind == EntityKind::Other {
446                continue;
447            }
448            let short_name = short_name_from_iri(iri);
449            entities.push(Entity {
450                iri: iri.clone(),
451                short_name: short_name.clone(),
452                kind: state.kind,
453                ontology_id: ontology_id.clone(),
454                source_location: find_entity_source_location(
455                    source_text,
456                    iri,
457                    &short_name,
458                    &self.namespaces,
459                ),
460                labels: state.labels.clone(),
461                comments: state.comments.clone(),
462                deprecated: state.deprecated,
463            });
464        }
465        entities.sort_by(|a, b| a.iri.cmp(&b.iri));
466
467        let namespace_rows = self
468            .namespaces
469            .iter()
470            .map(|(prefix, iri)| Namespace {
471                prefix: prefix.clone(),
472                iri: iri.clone(),
473                ontology_id: ontology_id.clone(),
474            })
475            .collect();
476
477        let import_rows = self
478            .imports
479            .iter()
480            .map(|import_iri| Import {
481                ontology_id: ontology_id.clone(),
482                import_iri: import_iri.clone(),
483            })
484            .collect();
485
486        let mut annotations = self.annotations;
487        for ann in &mut annotations {
488            ann.ontology_id = ontology_id.clone();
489        }
490        let mut axioms = self.axioms;
491        for axiom in &mut axioms {
492            axiom.ontology_id = ontology_id.clone();
493        }
494
495        Ok(ParsedOntology {
496            ontology_id,
497            base_iri,
498            imports: self.imports.into_iter().collect(),
499            namespaces: self.namespaces.clone(),
500            entities,
501            annotations,
502            axioms,
503            namespace_rows,
504            import_rows,
505            parse_status,
506            parse_message,
507            parse_error_location,
508            triple_count: self.triple_count,
509            quads,
510        })
511    }
512}
513
514fn entity_kind_for_type(type_iri: &str) -> EntityKind {
515    match type_iri {
516        t if t == OWL::class().as_str() || t == Rdfs::class().as_str() => EntityKind::Class,
517        t if t == OWL::object_property().as_str() => EntityKind::ObjectProperty,
518        t if t == OWL::datatype_property().as_str() => EntityKind::DataProperty,
519        t if t == OWL::annotation_property().as_str() => EntityKind::AnnotationProperty,
520        t if t == OWL::named_individual().as_str() => EntityKind::Individual,
521        t if t == OWL::ontology().as_str() => EntityKind::Ontology,
522        _ => EntityKind::Other,
523    }
524}
525
526fn kind_priority(kind: EntityKind) -> u8 {
527    match kind {
528        EntityKind::Class => 5,
529        EntityKind::ObjectProperty => 5,
530        EntityKind::DataProperty => 5,
531        EntityKind::AnnotationProperty => 5,
532        EntityKind::Individual => 4,
533        EntityKind::Ontology => 3,
534        EntityKind::Other => 0,
535    }
536}
537
538fn subject_to_string(subject: &Subject) -> String {
539    match subject {
540        Subject::NamedNode(node) => node.as_str().to_string(),
541        Subject::BlankNode(node) => format!("_:{}", node.as_str()),
542        #[allow(unreachable_patterns)]
543        _ => subject.to_string(),
544    }
545}
546
547fn term_to_string(term: &Term) -> String {
548    match term {
549        Term::NamedNode(node) => node.as_str().to_string(),
550        Term::BlankNode(node) => format!("_:{}", node.as_str()),
551        Term::Literal(lit) => lit.to_string(),
552        #[allow(unreachable_patterns)]
553        _ => term.to_string(),
554    }
555}
556
557fn find_entity_source_location(
558    source_text: &str,
559    iri: &str,
560    short_name: &str,
561    namespaces: &BTreeMap<String, String>,
562) -> SourceLocation {
563    let mut needles = vec![iri.to_string(), format!("<{iri}>"), format!(":{short_name}")];
564    for (prefix, ns) in namespaces {
565        if iri.starts_with(ns) && !prefix.is_empty() {
566            needles.push(format!("{prefix}:{short_name}"));
567        }
568    }
569    for line in source_text.lines() {
570        let trimmed = line.trim();
571        if !(trimmed.starts_with("@prefix")
572            || trimmed.starts_with("@PREFIX")
573            || trimmed.starts_with("PREFIX "))
574        {
575            continue;
576        }
577        if let Some(colon) = trimmed.find(':') {
578            let prefix = trimmed["@prefix ".len()..colon].trim();
579            let prefix = prefix.trim_start_matches('@');
580            if let (Some(start), Some(end)) = (line.find('<'), line.find('>')) {
581                let ns = &line[start + 1..end];
582                if iri.starts_with(ns) && !prefix.is_empty() {
583                    needles.push(format!("{prefix}:{short_name}"));
584                }
585            }
586        }
587    }
588
589    for (line_idx, line) in source_text.lines().enumerate() {
590        for needle in &needles {
591            if let Some(col) = line.find(needle) {
592                return SourceLocation {
593                    line: Some((line_idx + 1) as u64),
594                    column: Some(col as u64),
595                    start_byte: None,
596                    end_byte: None,
597                };
598            }
599        }
600    }
601
602    SourceLocation::default()
603}
604
/// Returns the local name of an IRI.
///
/// The name is the text after the last `#`, or, when there is no `#`, after
/// the last `/`. Trailing `#` and `/` characters are ignored first, so
/// `http://example.org/onto#` and `http://example.org/onto/` both yield
/// `onto` rather than an empty name. An IRI without either separator is
/// returned unchanged, as is one made up only of separators.
fn short_name_from_iri(iri: &str) -> String {
    let trimmed = iri.trim_end_matches(['#', '/']);
    if trimmed.is_empty() {
        return iri.to_string();
    }
    trimmed
        .rsplit_once('#')
        .or_else(|| trimmed.rsplit_once('/'))
        .map_or(trimmed, |(_, name)| name)
        .to_string()
}
614
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Input larger than `MAX_FILE_BYTES` is rejected before any parsing.
    #[test]
    fn rejects_oversized_source_text() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("huge.ttl");
        let oversized = "x".repeat((MAX_FILE_BYTES + 1) as usize);
        let err = parse_ontology_text(
            &path,
            OntologyFormat::Turtle,
            "doc-1",
            &oversized,
            oversized.as_bytes(),
        )
        .unwrap_err();
        assert!(matches!(err, ParseError::LimitExceeded(_)));
    }

    /// A syntax error is reported on the result rather than as an `Err`.
    #[test]
    fn reports_syntax_error_in_band() {
        let source = "@prefix ex <broken";
        let parsed = parse_ontology_text(
            Path::new("broken.ttl"),
            OntologyFormat::Turtle,
            "doc-1",
            source,
            source.as_bytes(),
        )
        .unwrap();

        assert_eq!(parsed.parse_status, ParseStatus::Error);
        assert!(parsed.parse_message.is_some());
        assert!(parsed.entities.is_empty());
        assert_eq!(parsed.triple_count, 0);
    }

    /// Classes and properties are extracted with their kind, labels and location.
    #[test]
    fn parses_simple_turtle_ontology() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("test.ttl");
        let mut f = fs::File::create(&path).unwrap();
        writeln!(
            f,
            r#"@prefix ex: <http://example.org/test#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://example.org/test> a owl:Ontology .

ex:Person a owl:Class ;
    rdfs:label "Person" ;
    rdfs:comment "A human being" .

ex:knows a owl:ObjectProperty ;
    rdfs:label "knows" .
"#
        )
        .unwrap();

        let parsed =
            parse_ontology_file(&path, OntologyFormat::Turtle, "doc-1", "hash", 0).unwrap();

        assert_eq!(parsed.parse_status, ParseStatus::Ok);

        let person = parsed
            .entities
            .iter()
            .find(|e| e.iri == "http://example.org/test#Person")
            .expect("Person entity");
        assert_eq!(person.kind, EntityKind::Class);
        // Literals keep their quotes in the stored label.
        assert_eq!(person.labels, vec!["\"Person\"".to_string()]);
        assert!(person.source_location.line.is_some());

        let knows = parsed
            .entities
            .iter()
            .find(|e| e.iri == "http://example.org/test#knows")
            .expect("knows property");
        assert_eq!(knows.kind, EntityKind::ObjectProperty);
        assert_eq!(knows.labels, vec!["\"knows\"".to_string()]);
    }

    /// Declared prefixes end up in `namespaces`; the ontology IRI becomes the base IRI.
    #[test]
    fn extracts_turtle_prefix_declarations() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("test.ttl");
        let mut f = fs::File::create(&path).unwrap();
        writeln!(
            f,
            r#"@prefix ex: <http://example.org/test#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .

<http://example.org/test> a owl:Ontology .
ex:Person a owl:Class .
"#
        )
        .unwrap();

        let parsed =
            parse_ontology_file(&path, OntologyFormat::Turtle, "doc-1", "hash", 0).unwrap();

        assert_eq!(parsed.base_iri.as_deref(), Some("http://example.org/test"));
        assert_eq!(
            parsed.namespaces.get("ex").map(String::as_str),
            Some("http://example.org/test#")
        );
    }
}