1use crate::vocab::{Rdf, Rdfs, OWL};
2use ontoindex_core::{
3 limits::{MAX_FILE_BYTES, MAX_TRIPLES_PER_FILE},
4 Annotation, Axiom, Entity, EntityKind, Import, Namespace, OntologyFormat, ParseStatus,
5 SourceLocation, AXIOM_KIND_SUB_CLASS_OF,
6};
7use oxigraph::io::{RdfFormat, RdfParseError, RdfParser};
8use oxigraph::model::{Quad, Subject, Term};
9use std::collections::{BTreeMap, BTreeSet, HashMap};
10use std::fs;
11use std::path::Path;
12use thiserror::Error;
13
14#[derive(Debug, Error)]
15pub enum ParseError {
16 #[error("IO error: {0}")]
17 Io(#[from] std::io::Error),
18
19 #[error("RDF parse error: {0}")]
20 Rdf(String),
21
22 #[error("unsupported format: {0}")]
23 UnsupportedFormat(String),
24
25 #[error("limit exceeded: {0}")]
26 LimitExceeded(String),
27}
28
29pub type Result<T> = std::result::Result<T, ParseError>;
30
31#[derive(Debug, Clone)]
32pub struct ParsedOntology {
33 pub ontology_id: String,
34 pub base_iri: Option<String>,
35 pub imports: Vec<String>,
36 pub namespaces: BTreeMap<String, String>,
37 pub entities: Vec<Entity>,
38 pub annotations: Vec<Annotation>,
39 pub axioms: Vec<Axiom>,
40 pub namespace_rows: Vec<Namespace>,
41 pub import_rows: Vec<Import>,
42 pub parse_status: ParseStatus,
43 pub parse_message: Option<String>,
44 pub parse_error_location: Option<SourceLocation>,
45 pub triple_count: usize,
46 quads: Vec<Quad>,
47}
48
49impl ParsedOntology {
50 #[doc(hidden)]
52 pub fn quads(&self) -> &[Quad] {
53 &self.quads
54 }
55}
56
57pub fn parse_ontology_file(
58 path: &Path,
59 format: OntologyFormat,
60 ontology_id: &str,
61 content_hash: &str,
62 modified_time: u64,
63) -> Result<ParsedOntology> {
64 let _ = (content_hash, modified_time);
65 let metadata = fs::metadata(path)?;
66 if metadata.len() > MAX_FILE_BYTES {
67 return Err(ParseError::LimitExceeded(format!(
68 "file exceeds {MAX_FILE_BYTES} bytes: {}",
69 path.display()
70 )));
71 }
72 let content = fs::read(path)?;
73 let source_text = String::from_utf8_lossy(&content).into_owned();
74 parse_ontology_text(path, format, ontology_id, &source_text, &content)
75}
76
77pub fn parse_ontology_text(
79 path: &Path,
80 format: OntologyFormat,
81 ontology_id: &str,
82 source_text: &str,
83 raw_bytes: &[u8],
84) -> Result<ParsedOntology> {
85 if raw_bytes.len() as u64 > MAX_FILE_BYTES {
86 return Err(ParseError::LimitExceeded(format!(
87 "source exceeds {MAX_FILE_BYTES} bytes: {}",
88 path.display()
89 )));
90 }
91 let rdf_format = to_rdf_format(format, path)?;
92
93 let mut quads = Vec::new();
94 let mut parse_message = None;
95 let mut parse_error_location = None;
96 let mut parse_status = ParseStatus::Ok;
97
98 let parser = RdfParser::from_format(rdf_format);
99 for quad in parser.for_reader(raw_bytes) {
100 match quad {
101 Ok(q) => {
102 if quads.len() >= MAX_TRIPLES_PER_FILE {
103 return Err(ParseError::LimitExceeded(format!(
104 "file exceeds {MAX_TRIPLES_PER_FILE} triples: {}",
105 path.display()
106 )));
107 }
108 quads.push(q);
109 }
110 Err(e) => {
111 parse_status = ParseStatus::Error;
112 parse_message = Some(format_parse_error(&e));
113 parse_error_location = extract_parse_error_location(&e, source_text);
114 break;
115 }
116 }
117 }
118
119 if parse_status == ParseStatus::Error {
120 return Ok(empty_result(
121 ontology_id,
122 parse_status,
123 parse_message,
124 parse_error_location,
125 BTreeMap::new(),
126 ));
127 }
128
129 let mut namespaces =
130 if format == OntologyFormat::TriG { extract_prefixes(&quads) } else { BTreeMap::new() };
131 namespaces.extend(extract_declared_prefixes(source_text, format));
132 if namespaces.is_empty() {
133 namespaces.insert("".to_string(), default_base_iri(path));
134 }
135
136 let mut builder = OntologyBuilder::new(ontology_id.to_string(), namespaces.clone());
137 for quad in &quads {
138 builder.ingest_quad(quad);
139 }
140 builder.finish(parse_status, parse_message, parse_error_location, source_text, quads)
141}
142
143fn to_rdf_format(format: OntologyFormat, path: &Path) -> Result<RdfFormat> {
144 match format {
145 OntologyFormat::Turtle => Ok(RdfFormat::Turtle),
146 OntologyFormat::RdfXml | OntologyFormat::Owl => Ok(RdfFormat::RdfXml),
147 OntologyFormat::JsonLd => Ok(RdfFormat::JsonLd { profile: Default::default() }),
148 OntologyFormat::NTriples => Ok(RdfFormat::NTriples),
149 OntologyFormat::NQuads => Ok(RdfFormat::NQuads),
150 OntologyFormat::TriG => Ok(RdfFormat::TriG),
151 OntologyFormat::Unknown => Err(ParseError::UnsupportedFormat(path.display().to_string())),
152 }
153}
154
155fn format_parse_error(error: &RdfParseError) -> String {
156 error.to_string()
157}
158
159fn extract_parse_error_location(
160 error: &RdfParseError,
161 _source_text: &str,
162) -> Option<SourceLocation> {
163 let msg = error.to_string();
164 let line = msg.split_whitespace().collect::<Vec<_>>().windows(2).find_map(|w| {
165 if w[0].eq_ignore_ascii_case("line") {
166 w[1].trim_end_matches(':').parse().ok()
167 } else {
168 None
169 }
170 });
171 let column = msg.split_whitespace().collect::<Vec<_>>().windows(2).find_map(|w| {
172 if w[0].eq_ignore_ascii_case("column") || w[0].eq_ignore_ascii_case("col") {
173 w[1].trim_end_matches(':').parse().ok()
174 } else {
175 None
176 }
177 });
178 if line.is_some() || column.is_some() {
179 Some(SourceLocation { line, column, ..Default::default() })
180 } else {
181 None
182 }
183}
184
/// Builds the fallback namespace IRI for a source that declares no prefixes:
/// the literal `file://` followed by the path exactly as `Path::display` renders it.
///
/// No normalisation or percent-encoding is applied.
fn default_base_iri(path: &Path) -> String {
    let shown = path.display();
    format!("file://{shown}")
}
188
189fn extract_declared_prefixes(
190 source_text: &str,
191 format: OntologyFormat,
192) -> BTreeMap<String, String> {
193 let mut prefixes = BTreeMap::new();
194
195 if matches!(format, OntologyFormat::Turtle | OntologyFormat::TriG) {
196 for line in source_text.lines() {
197 let trimmed = line.trim();
198 let rest = trimmed
199 .strip_prefix("@prefix ")
200 .or_else(|| trimmed.strip_prefix("@PREFIX "))
201 .or_else(|| trimmed.strip_prefix("PREFIX "));
202 let Some(rest) = rest else {
203 continue;
204 };
205 let Some((prefix_part, iri_part)) = rest.split_once('<') else {
206 continue;
207 };
208 let prefix = prefix_part.trim().trim_end_matches(':');
209 let Some(iri) = iri_part.split('>').next() else {
210 continue;
211 };
212 prefixes.insert(prefix.to_string(), iri.to_string());
213 }
214 return prefixes;
215 }
216
217 if matches!(format, OntologyFormat::RdfXml | OntologyFormat::Owl) {
218 static XMLNS_ATTR: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
219 regex::Regex::new(r#"xmlns(?::([A-Za-z][\w-]*))?="([^"]+)""#).expect("xmlns regex")
220 });
221 for cap in XMLNS_ATTR.captures_iter(source_text) {
222 let prefix = cap.get(1).map(|m| m.as_str()).unwrap_or("");
223 let iri = cap.get(2).map(|m| m.as_str()).unwrap_or("");
224 if !iri.is_empty() {
225 prefixes.insert(prefix.to_string(), iri.to_string());
226 }
227 }
228 }
229
230 prefixes
231}
232
233fn extract_prefixes(quads: &[Quad]) -> BTreeMap<String, String> {
234 let mut prefixes = BTreeMap::new();
235 for quad in quads {
236 if let oxigraph::model::GraphNameRef::NamedNode(graph) = quad.graph_name.as_ref() {
237 let iri = graph.as_str();
238 if let Some((prefix, _)) = iri.rsplit_once('#') {
239 if let Some((p, _)) = prefix.rsplit_once('/') {
240 prefixes
241 .entry(short_name_from_iri(p))
242 .or_insert_with(|| format!("{}/", prefix.trim_end_matches('#')));
243 }
244 }
245 }
246 }
247 prefixes
248}
249
250fn empty_result(
251 ontology_id: &str,
252 parse_status: ParseStatus,
253 parse_message: Option<String>,
254 parse_error_location: Option<SourceLocation>,
255 namespaces: BTreeMap<String, String>,
256) -> ParsedOntology {
257 ParsedOntology {
258 ontology_id: ontology_id.to_string(),
259 base_iri: namespaces.values().next().cloned(),
260 imports: Vec::new(),
261 namespaces: namespaces.clone(),
262 entities: Vec::new(),
263 annotations: Vec::new(),
264 axioms: Vec::new(),
265 namespace_rows: namespaces
266 .into_iter()
267 .map(|(prefix, iri)| Namespace { prefix, iri, ontology_id: ontology_id.to_string() })
268 .collect(),
269 import_rows: Vec::new(),
270 parse_status,
271 parse_message,
272 parse_error_location,
273 triple_count: 0,
274 quads: Vec::new(),
275 }
276}
277
278struct EntityState {
279 kind: EntityKind,
280 labels: Vec<String>,
281 comments: Vec<String>,
282 deprecated: bool,
283 types: BTreeSet<String>,
284}
285
286struct OntologyBuilder {
287 ontology_id: String,
288 namespaces: BTreeMap<String, String>,
289 entities: HashMap<String, EntityState>,
290 annotations: Vec<Annotation>,
291 axioms: Vec<Axiom>,
292 imports: BTreeSet<String>,
293 ontology_iris: BTreeSet<String>,
294 triple_count: usize,
295 axiom_counter: usize,
296}
297
298impl OntologyBuilder {
299 fn new(ontology_id: String, namespaces: BTreeMap<String, String>) -> Self {
300 Self {
301 ontology_id,
302 namespaces,
303 entities: HashMap::new(),
304 annotations: Vec::new(),
305 axioms: Vec::new(),
306 imports: BTreeSet::new(),
307 ontology_iris: BTreeSet::new(),
308 triple_count: 0,
309 axiom_counter: 0,
310 }
311 }
312
313 fn ingest_quad(&mut self, quad: &Quad) {
314 self.triple_count += 1;
315 let subject = subject_to_string(&quad.subject);
316 let predicate = quad.predicate.as_str().to_string();
317 let object = term_to_string(&quad.object);
318
319 if quad.predicate == Rdf::type_() {
320 if let Term::NamedNode(node) = &quad.object {
321 let type_iri = node.as_str();
322 if type_iri == OWL::ontology().as_str() {
323 self.ontology_iris.insert(subject.clone());
324 }
325 let kind = entity_kind_for_type(type_iri);
326 if kind != EntityKind::Other {
327 let entry =
328 self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
329 kind,
330 labels: Vec::new(),
331 comments: Vec::new(),
332 deprecated: false,
333 types: BTreeSet::new(),
334 });
335 entry.types.insert(type_iri.to_string());
336 if entry.kind == EntityKind::Other
337 || kind_priority(kind) > kind_priority(entry.kind)
338 {
339 entry.kind = kind;
340 }
341 }
342 }
343 return;
344 }
345
346 if quad.predicate == OWL::imports() {
347 self.imports.insert(object.clone());
348 return;
349 }
350
351 if quad.predicate == Rdfs::label() {
352 if let Some(entity) = self.entities.get_mut(&subject) {
353 entity.labels.push(object.clone());
354 } else {
355 self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
356 kind: EntityKind::Other,
357 labels: vec![object.clone()],
358 comments: Vec::new(),
359 deprecated: false,
360 types: BTreeSet::new(),
361 });
362 }
363 self.annotations.push(Annotation {
364 subject: subject.clone(),
365 predicate: predicate.clone(),
366 object: object.clone(),
367 ontology_id: self.ontology_id.clone(),
368 source_location: SourceLocation::default(),
369 });
370 return;
371 }
372
373 if quad.predicate == Rdfs::comment() {
374 if let Some(entity) = self.entities.get_mut(&subject) {
375 entity.comments.push(object.clone());
376 } else {
377 self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
378 kind: EntityKind::Other,
379 labels: Vec::new(),
380 comments: vec![object.clone()],
381 deprecated: false,
382 types: BTreeSet::new(),
383 });
384 }
385 self.annotations.push(Annotation {
386 subject: subject.clone(),
387 predicate: predicate.clone(),
388 object: object.clone(),
389 ontology_id: self.ontology_id.clone(),
390 source_location: SourceLocation::default(),
391 });
392 return;
393 }
394
395 if quad.predicate == OWL::deprecated() {
396 let entry = self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
397 kind: EntityKind::Other,
398 labels: Vec::new(),
399 comments: Vec::new(),
400 deprecated: false,
401 types: BTreeSet::new(),
402 });
403 entry.deprecated = ontoindex_core::parse_boolean_literal(&object).unwrap_or(false);
404 return;
405 }
406
407 if quad.predicate == Rdfs::sub_class_of() {
408 self.axiom_counter += 1;
409 self.axioms.push(Axiom {
410 id: format!("{}#axiom-{}", self.ontology_id, self.axiom_counter),
411 ontology_id: self.ontology_id.clone(),
412 subject: subject.clone(),
413 predicate: predicate.clone(),
414 object: object.clone(),
415 axiom_kind: AXIOM_KIND_SUB_CLASS_OF.to_string(),
416 source_location: SourceLocation::default(),
417 });
418 }
419 }
420
421 fn finish(
422 self,
423 parse_status: ParseStatus,
424 parse_message: Option<String>,
425 parse_error_location: Option<SourceLocation>,
426 source_text: &str,
427 quads: Vec<Quad>,
428 ) -> Result<ParsedOntology> {
429 let base_iri = self
430 .ontology_iris
431 .iter()
432 .next()
433 .cloned()
434 .or_else(|| self.namespaces.get("").cloned())
435 .or_else(|| self.namespaces.values().next().cloned());
436
437 let ontology_id = if let Some(iri) = self.ontology_iris.iter().next() {
438 iri.clone()
439 } else {
440 self.ontology_id.clone()
441 };
442
443 let mut entities = Vec::new();
444 for (iri, state) in &self.entities {
445 if state.kind == EntityKind::Other {
446 continue;
447 }
448 let short_name = short_name_from_iri(iri);
449 entities.push(Entity {
450 iri: iri.clone(),
451 short_name: short_name.clone(),
452 kind: state.kind,
453 ontology_id: ontology_id.clone(),
454 source_location: find_entity_source_location(
455 source_text,
456 iri,
457 &short_name,
458 &self.namespaces,
459 ),
460 labels: state.labels.clone(),
461 comments: state.comments.clone(),
462 deprecated: state.deprecated,
463 });
464 }
465 entities.sort_by(|a, b| a.iri.cmp(&b.iri));
466
467 let namespace_rows = self
468 .namespaces
469 .iter()
470 .map(|(prefix, iri)| Namespace {
471 prefix: prefix.clone(),
472 iri: iri.clone(),
473 ontology_id: ontology_id.clone(),
474 })
475 .collect();
476
477 let import_rows = self
478 .imports
479 .iter()
480 .map(|import_iri| Import {
481 ontology_id: ontology_id.clone(),
482 import_iri: import_iri.clone(),
483 })
484 .collect();
485
486 let mut annotations = self.annotations;
487 for ann in &mut annotations {
488 ann.ontology_id = ontology_id.clone();
489 }
490 let mut axioms = self.axioms;
491 for axiom in &mut axioms {
492 axiom.ontology_id = ontology_id.clone();
493 }
494
495 Ok(ParsedOntology {
496 ontology_id,
497 base_iri,
498 imports: self.imports.into_iter().collect(),
499 namespaces: self.namespaces.clone(),
500 entities,
501 annotations,
502 axioms,
503 namespace_rows,
504 import_rows,
505 parse_status,
506 parse_message,
507 parse_error_location,
508 triple_count: self.triple_count,
509 quads,
510 })
511 }
512}
513
514fn entity_kind_for_type(type_iri: &str) -> EntityKind {
515 match type_iri {
516 t if t == OWL::class().as_str() || t == Rdfs::class().as_str() => EntityKind::Class,
517 t if t == OWL::object_property().as_str() => EntityKind::ObjectProperty,
518 t if t == OWL::datatype_property().as_str() => EntityKind::DataProperty,
519 t if t == OWL::annotation_property().as_str() => EntityKind::AnnotationProperty,
520 t if t == OWL::named_individual().as_str() => EntityKind::Individual,
521 t if t == OWL::ontology().as_str() => EntityKind::Ontology,
522 _ => EntityKind::Other,
523 }
524}
525
526fn kind_priority(kind: EntityKind) -> u8 {
527 match kind {
528 EntityKind::Class => 5,
529 EntityKind::ObjectProperty => 5,
530 EntityKind::DataProperty => 5,
531 EntityKind::AnnotationProperty => 5,
532 EntityKind::Individual => 4,
533 EntityKind::Ontology => 3,
534 EntityKind::Other => 0,
535 }
536}
537
538fn subject_to_string(subject: &Subject) -> String {
539 match subject {
540 Subject::NamedNode(node) => node.as_str().to_string(),
541 Subject::BlankNode(node) => format!("_:{}", node.as_str()),
542 #[allow(unreachable_patterns)]
543 _ => subject.to_string(),
544 }
545}
546
547fn term_to_string(term: &Term) -> String {
548 match term {
549 Term::NamedNode(node) => node.as_str().to_string(),
550 Term::BlankNode(node) => format!("_:{}", node.as_str()),
551 Term::Literal(lit) => lit.to_string(),
552 #[allow(unreachable_patterns)]
553 _ => term.to_string(),
554 }
555}
556
557fn find_entity_source_location(
558 source_text: &str,
559 iri: &str,
560 short_name: &str,
561 namespaces: &BTreeMap<String, String>,
562) -> SourceLocation {
563 let mut needles = vec![iri.to_string(), format!("<{iri}>"), format!(":{short_name}")];
564 for (prefix, ns) in namespaces {
565 if iri.starts_with(ns) && !prefix.is_empty() {
566 needles.push(format!("{prefix}:{short_name}"));
567 }
568 }
569 for line in source_text.lines() {
570 let trimmed = line.trim();
571 if !(trimmed.starts_with("@prefix")
572 || trimmed.starts_with("@PREFIX")
573 || trimmed.starts_with("PREFIX "))
574 {
575 continue;
576 }
577 if let Some(colon) = trimmed.find(':') {
578 let prefix = trimmed["@prefix ".len()..colon].trim();
579 let prefix = prefix.trim_start_matches('@');
580 if let (Some(start), Some(end)) = (line.find('<'), line.find('>')) {
581 let ns = &line[start + 1..end];
582 if iri.starts_with(ns) && !prefix.is_empty() {
583 needles.push(format!("{prefix}:{short_name}"));
584 }
585 }
586 }
587 }
588
589 for (line_idx, line) in source_text.lines().enumerate() {
590 for needle in &needles {
591 if let Some(col) = line.find(needle) {
592 return SourceLocation {
593 line: Some((line_idx + 1) as u64),
594 column: Some(col as u64),
595 start_byte: None,
596 end_byte: None,
597 };
598 }
599 }
600 }
601
602 SourceLocation::default()
603}
604
/// Returns the local part of an IRI: the text after the last `#`, or, when there
/// is no `#`, after the last `/`. An IRI with neither separator is returned whole.
///
/// An IRI that ends with the separator yields an empty string.
fn short_name_from_iri(iri: &str) -> String {
    let local = iri
        .rsplit_once('#')
        .or_else(|| iri.rsplit_once('/'))
        .map_or(iri, |(_, tail)| tail);
    local.to_owned()
}
614
#[cfg(test)]
mod tests {
    use super::*;

    /// Ontology with one class and one object property, both labelled.
    const LABELLED_ONTOLOGY: &str = r#"@prefix ex: <http://example.org/test#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://example.org/test> a owl:Ontology .

ex:Person a owl:Class ;
    rdfs:label "Person" ;
    rdfs:comment "A human being" .

ex:knows a owl:ObjectProperty ;
    rdfs:label "knows" .
"#;

    /// Minimal ontology used to check prefix extraction.
    const MINIMAL_ONTOLOGY: &str = r#"@prefix ex: <http://example.org/test#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .

<http://example.org/test> a owl:Ontology .
ex:Person a owl:Class .
"#;

    /// Writes `contents` followed by a newline to `name` inside `dir` and returns the path.
    fn write_fixture(dir: &tempfile::TempDir, name: &str, contents: &str) -> std::path::PathBuf {
        let path = dir.path().join(name);
        fs::write(&path, format!("{contents}\n")).unwrap();
        path
    }

    /// Looks up an entity by IRI, panicking with `what` when it is absent.
    fn entity<'a>(parsed: &'a ParsedOntology, iri: &str, what: &str) -> &'a Entity {
        parsed.entities.iter().find(|e| e.iri == iri).expect(what)
    }

    #[test]
    fn rejects_oversized_source_text() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("huge.ttl");
        let oversized = "x".repeat((MAX_FILE_BYTES + 1) as usize);

        let outcome = parse_ontology_text(
            &path,
            OntologyFormat::Turtle,
            "doc-1",
            &oversized,
            oversized.as_bytes(),
        );

        assert!(matches!(outcome.unwrap_err(), ParseError::LimitExceeded(_)));
    }

    #[test]
    fn parses_simple_turtle_ontology() {
        let dir = tempfile::tempdir().unwrap();
        let path = write_fixture(&dir, "test.ttl", LABELLED_ONTOLOGY);

        let parsed =
            parse_ontology_file(&path, OntologyFormat::Turtle, "doc-1", "hash", 0).unwrap();

        assert_eq!(parsed.parse_status, ParseStatus::Ok);

        let person = entity(&parsed, "http://example.org/test#Person", "Person entity");
        assert_eq!(person.kind, EntityKind::Class);
        // Labels keep the literal's serialized form, quotes included.
        assert_eq!(person.labels, vec!["\"Person\"".to_string()]);
        assert!(person.source_location.line.is_some());

        let knows = entity(&parsed, "http://example.org/test#knows", "knows property");
        assert_eq!(knows.kind, EntityKind::ObjectProperty);
        assert_eq!(knows.labels, vec!["\"knows\"".to_string()]);
    }

    #[test]
    fn extracts_turtle_prefix_declarations() {
        let dir = tempfile::tempdir().unwrap();
        let path = write_fixture(&dir, "test.ttl", MINIMAL_ONTOLOGY);

        let parsed =
            parse_ontology_file(&path, OntologyFormat::Turtle, "doc-1", "hash", 0).unwrap();

        assert_eq!(parsed.base_iri.as_deref(), Some("http://example.org/test"));
        assert_eq!(
            parsed.namespaces.get("ex").map(String::as_str),
            Some("http://example.org/test#")
        );
    }
}