Skip to main content

ontoindex_parser/
rdf.rs

1use crate::vocab::{Rdf, Rdfs, OWL};
2use ontoindex_core::{
3    limits::{MAX_FILE_BYTES, MAX_TRIPLES_PER_FILE},
4    Annotation, Axiom, Entity, EntityKind, Import, Namespace, OntologyFormat, ParseStatus,
5    SourceLocation, AXIOM_KIND_SUB_CLASS_OF,
6};
7use oxigraph::io::{RdfFormat, RdfParseError, RdfParser};
8use oxigraph::model::{Quad, Subject, Term};
9use std::collections::{BTreeMap, BTreeSet, HashMap};
10use std::fs;
11use std::path::Path;
12use thiserror::Error;
13
/// Hard failures that prevent a [`ParsedOntology`] from being produced at all.
///
/// RDF syntax errors found while streaming quads are *not* reported through this
/// type by `parse_ontology_text`; they come back as an `Ok` result whose
/// `parse_status` is `ParseStatus::Error`.
#[derive(Debug, Error)]
pub enum ParseError {
    /// Reading the file or its metadata failed.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// RDF-level failure carried as a plain message.
    // NOTE(review): no visible code in this file constructs this variant — confirm callers.
    #[error("RDF parse error: {0}")]
    Rdf(String),

    /// The ontology format has no RDF parser mapping; the payload is the displayed path.
    #[error("unsupported format: {0}")]
    UnsupportedFormat(String),

    /// The source exceeded `MAX_FILE_BYTES` or `MAX_TRIPLES_PER_FILE`.
    #[error("limit exceeded: {0}")]
    LimitExceeded(String),
}
28
/// Result alias used throughout this module, with [`ParseError`] as the error type.
pub type Result<T> = std::result::Result<T, ParseError>;
30
/// Everything extracted from one ontology source.
///
/// When the RDF parser reports a syntax error the collections are left empty and
/// `parse_status` / `parse_message` / `parse_error_location` describe the failure.
#[derive(Debug, Clone)]
pub struct ParsedOntology {
    /// IRI of the first subject typed `owl:Ontology` (in sorted order), or the
    /// caller-supplied id when the source declares none.
    pub ontology_id: String,
    /// First declared ontology IRI, else the namespace bound to the empty prefix,
    /// else the first namespace in prefix order.
    pub base_iri: Option<String>,
    /// Objects of `owl:imports` statements, de-duplicated and sorted.
    pub imports: Vec<String>,
    /// Prefix → namespace IRI map.
    pub namespaces: BTreeMap<String, String>,
    /// Entities with a recognised kind, sorted by IRI.
    pub entities: Vec<Entity>,
    /// One row per `rdfs:label` / `rdfs:comment` statement.
    pub annotations: Vec<Annotation>,
    /// One row per `rdfs:subClassOf` statement.
    pub axioms: Vec<Axiom>,
    /// `namespaces` flattened into rows tagged with `ontology_id`.
    pub namespace_rows: Vec<Namespace>,
    /// `imports` flattened into rows tagged with `ontology_id`.
    pub import_rows: Vec<Import>,
    /// Whether the RDF parser accepted the whole source.
    pub parse_status: ParseStatus,
    /// Parser error message, if any.
    pub parse_message: Option<String>,
    /// Line/column scraped from the parser error message, if any.
    pub parse_error_location: Option<SourceLocation>,
    /// Number of quads ingested.
    pub triple_count: usize,
    /// Raw quads; exposed only through the hidden `quads()` accessor.
    quads: Vec<Quad>,
}
48
49impl ParsedOntology {
50    /// RDF quads for catalog indexing only — not a stable public API.
51    #[doc(hidden)]
52    pub fn quads(&self) -> &[Quad] {
53        &self.quads
54    }
55}
56
57pub fn parse_ontology_file(
58    path: &Path,
59    format: OntologyFormat,
60    ontology_id: &str,
61    content_hash: &str,
62    modified_time: u64,
63) -> Result<ParsedOntology> {
64    let _ = (content_hash, modified_time);
65    let metadata = fs::metadata(path)?;
66    if metadata.len() > MAX_FILE_BYTES {
67        return Err(ParseError::LimitExceeded(format!(
68            "file exceeds {MAX_FILE_BYTES} bytes: {}",
69            path.display()
70        )));
71    }
72    let content = fs::read(path)?;
73    let source_text = String::from_utf8_lossy(&content).into_owned();
74    parse_ontology_text(path, format, ontology_id, &source_text, &content)
75}
76
77/// Parse ontology source text (used for LSP open buffers and file parsing).
78pub fn parse_ontology_text(
79    path: &Path,
80    format: OntologyFormat,
81    ontology_id: &str,
82    source_text: &str,
83    raw_bytes: &[u8],
84) -> Result<ParsedOntology> {
85    if raw_bytes.len() as u64 > MAX_FILE_BYTES {
86        return Err(ParseError::LimitExceeded(format!(
87            "source exceeds {MAX_FILE_BYTES} bytes: {}",
88            path.display()
89        )));
90    }
91    let rdf_format = to_rdf_format(format, path)?;
92
93    let mut quads = Vec::new();
94    let mut parse_message = None;
95    let mut parse_error_location = None;
96    let mut parse_status = ParseStatus::Ok;
97
98    let parser = RdfParser::from_format(rdf_format);
99    for quad in parser.for_reader(raw_bytes) {
100        match quad {
101            Ok(q) => {
102                if quads.len() >= MAX_TRIPLES_PER_FILE {
103                    return Err(ParseError::LimitExceeded(format!(
104                        "file exceeds {MAX_TRIPLES_PER_FILE} triples: {}",
105                        path.display()
106                    )));
107                }
108                quads.push(q);
109            }
110            Err(e) => {
111                parse_status = ParseStatus::Error;
112                parse_message = Some(format_parse_error(&e));
113                parse_error_location = extract_parse_error_location(&e, source_text);
114                break;
115            }
116        }
117    }
118
119    if parse_status == ParseStatus::Error {
120        return Ok(empty_result(
121            ontology_id,
122            parse_status,
123            parse_message,
124            parse_error_location,
125            BTreeMap::new(),
126        ));
127    }
128
129    let mut namespaces =
130        if format == OntologyFormat::TriG { extract_prefixes(&quads) } else { BTreeMap::new() };
131    namespaces.extend(extract_declared_prefixes(source_text, format));
132    if namespaces.is_empty() {
133        namespaces.insert("".to_string(), default_base_iri(path));
134    }
135
136    let mut builder = OntologyBuilder::new(ontology_id.to_string(), namespaces.clone());
137    for quad in &quads {
138        builder.ingest_quad(quad);
139    }
140    builder.finish(parse_status, parse_message, parse_error_location, source_text, quads)
141}
142
143fn to_rdf_format(format: OntologyFormat, path: &Path) -> Result<RdfFormat> {
144    match format {
145        OntologyFormat::Turtle => Ok(RdfFormat::Turtle),
146        OntologyFormat::RdfXml | OntologyFormat::Owl => Ok(RdfFormat::RdfXml),
147        OntologyFormat::JsonLd => Ok(RdfFormat::JsonLd { profile: Default::default() }),
148        OntologyFormat::NTriples => Ok(RdfFormat::NTriples),
149        OntologyFormat::NQuads => Ok(RdfFormat::NQuads),
150        OntologyFormat::TriG => Ok(RdfFormat::TriG),
151        OntologyFormat::Unknown => Err(ParseError::UnsupportedFormat(path.display().to_string())),
152    }
153}
154
155fn format_parse_error(error: &RdfParseError) -> String {
156    error.to_string()
157}
158
159fn extract_parse_error_location(
160    error: &RdfParseError,
161    _source_text: &str,
162) -> Option<SourceLocation> {
163    let msg = error.to_string();
164    let line = msg.split_whitespace().collect::<Vec<_>>().windows(2).find_map(|w| {
165        if w[0].eq_ignore_ascii_case("line") {
166            w[1].trim_end_matches(':').parse().ok()
167        } else {
168            None
169        }
170    });
171    let column = msg.split_whitespace().collect::<Vec<_>>().windows(2).find_map(|w| {
172        if w[0].eq_ignore_ascii_case("column") || w[0].eq_ignore_ascii_case("col") {
173            w[1].trim_end_matches(':').parse().ok()
174        } else {
175            None
176        }
177    });
178    if line.is_some() || column.is_some() {
179        Some(SourceLocation { line, column, ..Default::default() })
180    } else {
181        None
182    }
183}
184
/// Builds the fallback namespace IRI for a source with no declared prefixes:
/// the literal `file://` followed by the path exactly as `Path::display` renders it.
fn default_base_iri(path: &Path) -> String {
    let mut iri = String::from("file://");
    iri.push_str(&path.display().to_string());
    iri
}
188
189fn extract_declared_prefixes(
190    source_text: &str,
191    format: OntologyFormat,
192) -> BTreeMap<String, String> {
193    let mut prefixes = BTreeMap::new();
194
195    if matches!(format, OntologyFormat::Turtle | OntologyFormat::TriG) {
196        for line in source_text.lines() {
197            let trimmed = line.trim();
198            let rest = trimmed
199                .strip_prefix("@prefix ")
200                .or_else(|| trimmed.strip_prefix("@PREFIX "))
201                .or_else(|| trimmed.strip_prefix("PREFIX "));
202            let Some(rest) = rest else {
203                continue;
204            };
205            let Some((prefix_part, iri_part)) = rest.split_once('<') else {
206                continue;
207            };
208            let prefix = prefix_part.trim().trim_end_matches(':');
209            let Some(iri) = iri_part.split('>').next() else {
210                continue;
211            };
212            prefixes.insert(prefix.to_string(), iri.to_string());
213        }
214        return prefixes;
215    }
216
217    if matches!(format, OntologyFormat::RdfXml | OntologyFormat::Owl) {
218        static XMLNS_ATTR: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
219            regex::Regex::new(r#"xmlns(?::([A-Za-z][\w-]*))?="([^"]+)""#).expect("xmlns regex")
220        });
221        for cap in XMLNS_ATTR.captures_iter(source_text) {
222            let prefix = cap.get(1).map(|m| m.as_str()).unwrap_or("");
223            let iri = cap.get(2).map(|m| m.as_str()).unwrap_or("");
224            if !iri.is_empty() {
225                prefixes.insert(prefix.to_string(), iri.to_string());
226            }
227        }
228    }
229
230    prefixes
231}
232
233fn extract_prefixes(quads: &[Quad]) -> BTreeMap<String, String> {
234    let mut prefixes = BTreeMap::new();
235    for quad in quads {
236        if let oxigraph::model::GraphNameRef::NamedNode(graph) = quad.graph_name.as_ref() {
237            let iri = graph.as_str();
238            if let Some((prefix, _)) = iri.rsplit_once('#') {
239                if let Some((p, _)) = prefix.rsplit_once('/') {
240                    prefixes
241                        .entry(short_name_from_iri(p))
242                        .or_insert_with(|| format!("{}/", prefix.trim_end_matches('#')));
243                }
244            }
245        }
246    }
247    prefixes
248}
249
250fn empty_result(
251    ontology_id: &str,
252    parse_status: ParseStatus,
253    parse_message: Option<String>,
254    parse_error_location: Option<SourceLocation>,
255    namespaces: BTreeMap<String, String>,
256) -> ParsedOntology {
257    ParsedOntology {
258        ontology_id: ontology_id.to_string(),
259        base_iri: namespaces.values().next().cloned(),
260        imports: Vec::new(),
261        namespaces: namespaces.clone(),
262        entities: Vec::new(),
263        annotations: Vec::new(),
264        axioms: Vec::new(),
265        namespace_rows: namespaces
266            .into_iter()
267            .map(|(prefix, iri)| Namespace { prefix, iri, ontology_id: ontology_id.to_string() })
268            .collect(),
269        import_rows: Vec::new(),
270        parse_status,
271        parse_message,
272        parse_error_location,
273        triple_count: 0,
274        quads: Vec::new(),
275    }
276}
277
/// Facts accumulated about one subject while quads are being ingested.
struct EntityState {
    /// Highest-priority kind seen so far; `EntityKind::Other` until a recognised
    /// `rdf:type` is ingested.
    kind: EntityKind,
    /// Objects of `rdfs:label` statements, in ingestion order.
    labels: Vec<String>,
    /// Objects of `rdfs:comment` statements, in ingestion order.
    comments: Vec<String>,
    /// Set from the most recent `owl:deprecated` statement.
    deprecated: bool,
    /// `rdf:type` IRIs that mapped to a recognised kind.
    // NOTE(review): written but never read in the visible code.
    types: BTreeSet<String>,
}
285
/// Accumulates entities, annotations, axioms and imports from a stream of quads
/// and turns them into a `ParsedOntology` in `finish`.
struct OntologyBuilder {
    /// Caller-supplied id; used for annotation and axiom rows and as the fallback
    /// ontology id when no `owl:Ontology` subject is found.
    ontology_id: String,
    /// Prefix → namespace IRI map supplied at construction.
    namespaces: BTreeMap<String, String>,
    /// Per-subject state keyed by the subject's string form.
    entities: HashMap<String, EntityState>,
    /// Rows for `rdfs:label` / `rdfs:comment` statements.
    annotations: Vec<Annotation>,
    /// Rows for `rdfs:subClassOf` statements.
    axioms: Vec<Axiom>,
    /// Objects of `owl:imports` statements.
    imports: BTreeSet<String>,
    /// Subjects typed `owl:Ontology`.
    ontology_iris: BTreeSet<String>,
    /// Number of quads passed to `ingest_quad`.
    triple_count: usize,
    /// Running counter used to build axiom ids.
    axiom_counter: usize,
}
297
298impl OntologyBuilder {
299    fn new(ontology_id: String, namespaces: BTreeMap<String, String>) -> Self {
300        Self {
301            ontology_id,
302            namespaces,
303            entities: HashMap::new(),
304            annotations: Vec::new(),
305            axioms: Vec::new(),
306            imports: BTreeSet::new(),
307            ontology_iris: BTreeSet::new(),
308            triple_count: 0,
309            axiom_counter: 0,
310        }
311    }
312
313    fn ingest_quad(&mut self, quad: &Quad) {
314        self.triple_count += 1;
315        let subject = subject_to_string(&quad.subject);
316        let predicate = quad.predicate.as_str().to_string();
317        let object = term_to_string(&quad.object);
318
319        if quad.predicate == Rdf::type_() {
320            if let Term::NamedNode(node) = &quad.object {
321                let type_iri = node.as_str();
322                if type_iri == OWL::ontology().as_str() {
323                    self.ontology_iris.insert(subject.clone());
324                }
325                let kind = entity_kind_for_type(type_iri);
326                if kind != EntityKind::Other {
327                    let entry =
328                        self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
329                            kind,
330                            labels: Vec::new(),
331                            comments: Vec::new(),
332                            deprecated: false,
333                            types: BTreeSet::new(),
334                        });
335                    entry.types.insert(type_iri.to_string());
336                    if entry.kind == EntityKind::Other
337                        || kind_priority(kind) > kind_priority(entry.kind)
338                    {
339                        entry.kind = kind;
340                    }
341                }
342            }
343            return;
344        }
345
346        if quad.predicate == OWL::imports() {
347            self.imports.insert(object.clone());
348            return;
349        }
350
351        if quad.predicate == Rdfs::label() {
352            if let Some(entity) = self.entities.get_mut(&subject) {
353                entity.labels.push(object.clone());
354            } else {
355                self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
356                    kind: EntityKind::Other,
357                    labels: vec![object.clone()],
358                    comments: Vec::new(),
359                    deprecated: false,
360                    types: BTreeSet::new(),
361                });
362            }
363            self.annotations.push(Annotation {
364                subject: subject.clone(),
365                predicate: predicate.clone(),
366                object: object.clone(),
367                ontology_id: self.ontology_id.clone(),
368                source_location: SourceLocation::default(),
369            });
370            return;
371        }
372
373        if quad.predicate == Rdfs::comment() {
374            if let Some(entity) = self.entities.get_mut(&subject) {
375                entity.comments.push(object.clone());
376            } else {
377                self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
378                    kind: EntityKind::Other,
379                    labels: Vec::new(),
380                    comments: vec![object.clone()],
381                    deprecated: false,
382                    types: BTreeSet::new(),
383                });
384            }
385            self.annotations.push(Annotation {
386                subject: subject.clone(),
387                predicate: predicate.clone(),
388                object: object.clone(),
389                ontology_id: self.ontology_id.clone(),
390                source_location: SourceLocation::default(),
391            });
392            return;
393        }
394
395        if quad.predicate == OWL::deprecated() {
396            let entry = self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
397                kind: EntityKind::Other,
398                labels: Vec::new(),
399                comments: Vec::new(),
400                deprecated: false,
401                types: BTreeSet::new(),
402            });
403            entry.deprecated = object == "true" || object.contains("true");
404            return;
405        }
406
407        if quad.predicate == Rdfs::sub_class_of() {
408            self.axiom_counter += 1;
409            self.axioms.push(Axiom {
410                id: format!("{}#axiom-{}", self.ontology_id, self.axiom_counter),
411                ontology_id: self.ontology_id.clone(),
412                subject: subject.clone(),
413                predicate: predicate.clone(),
414                object: object.clone(),
415                axiom_kind: AXIOM_KIND_SUB_CLASS_OF.to_string(),
416                source_location: SourceLocation::default(),
417            });
418        }
419    }
420
421    fn finish(
422        self,
423        parse_status: ParseStatus,
424        parse_message: Option<String>,
425        parse_error_location: Option<SourceLocation>,
426        source_text: &str,
427        quads: Vec<Quad>,
428    ) -> Result<ParsedOntology> {
429        let base_iri = self
430            .ontology_iris
431            .iter()
432            .next()
433            .cloned()
434            .or_else(|| self.namespaces.get("").cloned())
435            .or_else(|| self.namespaces.values().next().cloned());
436
437        let ontology_id = if let Some(iri) = self.ontology_iris.iter().next() {
438            iri.clone()
439        } else {
440            self.ontology_id.clone()
441        };
442
443        let mut entities = Vec::new();
444        for (iri, state) in &self.entities {
445            if state.kind == EntityKind::Other {
446                continue;
447            }
448            let short_name = short_name_from_iri(iri);
449            entities.push(Entity {
450                iri: iri.clone(),
451                short_name: short_name.clone(),
452                kind: state.kind,
453                ontology_id: ontology_id.clone(),
454                source_location: find_entity_source_location(
455                    source_text,
456                    iri,
457                    &short_name,
458                    &self.namespaces,
459                ),
460                labels: state.labels.clone(),
461                comments: state.comments.clone(),
462                deprecated: state.deprecated,
463            });
464        }
465        entities.sort_by(|a, b| a.iri.cmp(&b.iri));
466
467        let namespace_rows = self
468            .namespaces
469            .iter()
470            .map(|(prefix, iri)| Namespace {
471                prefix: prefix.clone(),
472                iri: iri.clone(),
473                ontology_id: ontology_id.clone(),
474            })
475            .collect();
476
477        let import_rows = self
478            .imports
479            .iter()
480            .map(|import_iri| Import {
481                ontology_id: ontology_id.clone(),
482                import_iri: import_iri.clone(),
483            })
484            .collect();
485
486        Ok(ParsedOntology {
487            ontology_id,
488            base_iri,
489            imports: self.imports.into_iter().collect(),
490            namespaces: self.namespaces.clone(),
491            entities,
492            annotations: self.annotations,
493            axioms: self.axioms,
494            namespace_rows,
495            import_rows,
496            parse_status,
497            parse_message,
498            parse_error_location,
499            triple_count: self.triple_count,
500            quads,
501        })
502    }
503}
504
505fn entity_kind_for_type(type_iri: &str) -> EntityKind {
506    match type_iri {
507        t if t == OWL::class().as_str() || t == Rdfs::class().as_str() => EntityKind::Class,
508        t if t == OWL::object_property().as_str() => EntityKind::ObjectProperty,
509        t if t == OWL::datatype_property().as_str() => EntityKind::DataProperty,
510        t if t == OWL::annotation_property().as_str() => EntityKind::AnnotationProperty,
511        t if t == OWL::named_individual().as_str() => EntityKind::Individual,
512        t if t == OWL::ontology().as_str() => EntityKind::Ontology,
513        _ => EntityKind::Other,
514    }
515}
516
517fn kind_priority(kind: EntityKind) -> u8 {
518    match kind {
519        EntityKind::Class => 5,
520        EntityKind::ObjectProperty => 5,
521        EntityKind::DataProperty => 5,
522        EntityKind::AnnotationProperty => 5,
523        EntityKind::Individual => 4,
524        EntityKind::Ontology => 3,
525        EntityKind::Other => 0,
526    }
527}
528
529fn subject_to_string(subject: &Subject) -> String {
530    match subject {
531        Subject::NamedNode(node) => node.as_str().to_string(),
532        Subject::BlankNode(node) => format!("_:{}", node.as_str()),
533        #[allow(unreachable_patterns)]
534        _ => subject.to_string(),
535    }
536}
537
538fn term_to_string(term: &Term) -> String {
539    match term {
540        Term::NamedNode(node) => node.as_str().to_string(),
541        Term::BlankNode(node) => format!("_:{}", node.as_str()),
542        Term::Literal(lit) => lit.to_string(),
543        #[allow(unreachable_patterns)]
544        _ => term.to_string(),
545    }
546}
547
548fn find_entity_source_location(
549    source_text: &str,
550    iri: &str,
551    short_name: &str,
552    namespaces: &BTreeMap<String, String>,
553) -> SourceLocation {
554    let mut needles = vec![iri.to_string(), format!("<{iri}>"), format!(":{short_name}")];
555    for (prefix, ns) in namespaces {
556        if iri.starts_with(ns) && !prefix.is_empty() {
557            needles.push(format!("{prefix}:{short_name}"));
558        }
559    }
560    for line in source_text.lines() {
561        let trimmed = line.trim();
562        if !(trimmed.starts_with("@prefix")
563            || trimmed.starts_with("@PREFIX")
564            || trimmed.starts_with("PREFIX "))
565        {
566            continue;
567        }
568        if let Some(colon) = trimmed.find(':') {
569            let prefix = trimmed["@prefix ".len()..colon].trim();
570            let prefix = prefix.trim_start_matches('@');
571            if let (Some(start), Some(end)) = (line.find('<'), line.find('>')) {
572                let ns = &line[start + 1..end];
573                if iri.starts_with(ns) && !prefix.is_empty() {
574                    needles.push(format!("{prefix}:{short_name}"));
575                }
576            }
577        }
578    }
579
580    for (line_idx, line) in source_text.lines().enumerate() {
581        for needle in &needles {
582            if let Some(col) = line.find(needle) {
583                return SourceLocation {
584                    line: Some((line_idx + 1) as u64),
585                    column: Some(col as u64),
586                    start_byte: None,
587                    end_byte: None,
588                };
589            }
590        }
591    }
592
593    SourceLocation::default()
594}
595
/// Returns the local part of an IRI: the text after the last `#`, or — when
/// there is no `#` — after the last `/`. An IRI containing neither is returned
/// whole. The result is empty when the IRI ends with the separator.
fn short_name_from_iri(iri: &str) -> String {
    let separator = iri.rfind('#').or_else(|| iri.rfind('/'));
    match separator {
        // Both separators are single-byte, so `at + 1` is a char boundary.
        Some(at) => iri[at + 1..].to_owned(),
        None => iri.to_owned(),
    }
}
605
// Unit tests for the parser entry points: the size limit, entity extraction
// from Turtle, and prefix / base IRI discovery.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Text one byte over `MAX_FILE_BYTES` is rejected with `LimitExceeded`.
    #[test]
    fn rejects_oversized_source_text() {
        let dir = tempfile::tempdir().unwrap();
        // The path is only used in the error message; no file is created.
        let path = dir.path().join("huge.ttl");
        let oversized = "x".repeat((MAX_FILE_BYTES + 1) as usize);
        let err = parse_ontology_text(
            &path,
            OntologyFormat::Turtle,
            "doc-1",
            &oversized,
            oversized.as_bytes(),
        )
        .unwrap_err();
        assert!(matches!(err, ParseError::LimitExceeded(_)));
    }

    /// A small Turtle file yields typed entities with their labels and a source line.
    #[test]
    fn parses_simple_turtle_ontology() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("test.ttl");
        let mut f = fs::File::create(&path).unwrap();
        writeln!(
            f,
            r#"@prefix ex: <http://example.org/test#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://example.org/test> a owl:Ontology .

ex:Person a owl:Class ;
    rdfs:label "Person" ;
    rdfs:comment "A human being" .

ex:knows a owl:ObjectProperty ;
    rdfs:label "knows" .
"#
        )
        .unwrap();

        let parsed =
            parse_ontology_file(&path, OntologyFormat::Turtle, "doc-1", "hash", 0).unwrap();

        assert_eq!(parsed.parse_status, ParseStatus::Ok);

        let person = parsed
            .entities
            .iter()
            .find(|e| e.iri == "http://example.org/test#Person")
            .expect("Person entity");
        assert_eq!(person.kind, EntityKind::Class);
        // Labels keep the literal's surrounding quotes.
        assert_eq!(person.labels, vec!["\"Person\"".to_string()]);
        assert!(person.source_location.line.is_some());

        let knows = parsed
            .entities
            .iter()
            .find(|e| e.iri == "http://example.org/test#knows")
            .expect("knows property");
        assert_eq!(knows.kind, EntityKind::ObjectProperty);
        assert_eq!(knows.labels, vec!["\"knows\"".to_string()]);
    }

    /// `@prefix` lines populate `namespaces`, and the `owl:Ontology` subject
    /// becomes the base IRI.
    #[test]
    fn extracts_turtle_prefix_declarations() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("test.ttl");
        let mut f = fs::File::create(&path).unwrap();
        writeln!(
            f,
            r#"@prefix ex: <http://example.org/test#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .

<http://example.org/test> a owl:Ontology .
ex:Person a owl:Class .
"#
        )
        .unwrap();

        let parsed =
            parse_ontology_file(&path, OntologyFormat::Turtle, "doc-1", "hash", 0).unwrap();

        assert_eq!(parsed.base_iri.as_deref(), Some("http://example.org/test"));
        assert_eq!(
            parsed.namespaces.get("ex").map(String::as_str),
            Some("http://example.org/test#")
        );
    }
}