1use crate::vocab::{Rdf, Rdfs, OWL};
2use ontoindex_core::{
3 limits::{MAX_FILE_BYTES, MAX_TRIPLES_PER_FILE},
4 Annotation, Axiom, Entity, EntityKind, Import, Namespace, OntologyFormat, ParseStatus,
5 SourceLocation, AXIOM_KIND_SUB_CLASS_OF,
6};
7use oxigraph::io::{RdfFormat, RdfParseError, RdfParser};
8use oxigraph::model::{Quad, Subject, Term};
9use std::collections::{BTreeMap, BTreeSet, HashMap};
10use std::fs;
11use std::path::Path;
12use thiserror::Error;
13
/// Errors that abort parsing of an ontology source.
///
/// Syntax errors reported by the RDF parser are *not* surfaced through this type by
/// `parse_ontology_text`; they are recorded on the returned `ParsedOntology` instead.
#[derive(Debug, Error)]
pub enum ParseError {
    /// A filesystem operation (metadata lookup or read) failed.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// An RDF-level failure carried as a plain message.
    // NOTE(review): no code visible in this part of the file constructs this variant.
    #[error("RDF parse error: {0}")]
    Rdf(String),

    /// The requested format has no RDF parser mapping; the payload is the displayed path.
    #[error("unsupported format: {0}")]
    UnsupportedFormat(String),

    /// A byte-size or triple-count limit was exceeded; the payload names the limit and path.
    #[error("limit exceeded: {0}")]
    LimitExceeded(String),
}
28
/// Result alias used throughout this module; the error type is always [`ParseError`].
pub type Result<T> = std::result::Result<T, ParseError>;
30
/// Everything extracted from one ontology source.
///
/// When parsing fails with a syntax error the value is still produced, but with
/// `parse_status` set to the error status, the message/location filled in, and every
/// collection empty.
#[derive(Debug, Clone)]
pub struct ParsedOntology {
    /// IRI of the first subject typed `owl:Ontology` (in sorted order) when one exists,
    /// otherwise the identifier supplied by the caller.
    pub ontology_id: String,
    /// Declared ontology IRI, else the namespace bound to the empty prefix, else the first
    /// namespace IRI in prefix order.
    pub base_iri: Option<String>,
    /// Objects of `owl:imports` statements, de-duplicated and sorted.
    pub imports: Vec<String>,
    /// Prefix → namespace IRI map.
    pub namespaces: BTreeMap<String, String>,
    /// Subjects that received a recognised `rdf:type`, sorted by IRI.
    pub entities: Vec<Entity>,
    /// One row per `rdfs:label` / `rdfs:comment` statement.
    pub annotations: Vec<Annotation>,
    /// One row per `rdfs:subClassOf` statement.
    pub axioms: Vec<Axiom>,
    /// `namespaces` flattened into rows carrying the ontology id.
    pub namespace_rows: Vec<Namespace>,
    /// `imports` flattened into rows carrying the ontology id.
    pub import_rows: Vec<Import>,
    /// Outcome of the RDF parse.
    pub parse_status: ParseStatus,
    /// Parser error text when parsing failed.
    pub parse_message: Option<String>,
    /// Line/column recovered from the parser error text, when present.
    pub parse_error_location: Option<SourceLocation>,
    /// Number of quads ingested; `0` for a failed parse.
    pub triple_count: usize,
    /// Raw quads kept from parsing; exposed read-only through `quads()`.
    quads: Vec<Quad>,
}
48
49impl ParsedOntology {
50 #[doc(hidden)]
52 pub fn quads(&self) -> &[Quad] {
53 &self.quads
54 }
55}
56
57pub fn parse_ontology_file(
58 path: &Path,
59 format: OntologyFormat,
60 ontology_id: &str,
61 content_hash: &str,
62 modified_time: u64,
63) -> Result<ParsedOntology> {
64 let _ = (content_hash, modified_time);
65 let metadata = fs::metadata(path)?;
66 if metadata.len() > MAX_FILE_BYTES {
67 return Err(ParseError::LimitExceeded(format!(
68 "file exceeds {MAX_FILE_BYTES} bytes: {}",
69 path.display()
70 )));
71 }
72 let content = fs::read(path)?;
73 let source_text = String::from_utf8_lossy(&content).into_owned();
74 parse_ontology_text(path, format, ontology_id, &source_text, &content)
75}
76
77pub fn parse_ontology_text(
79 path: &Path,
80 format: OntologyFormat,
81 ontology_id: &str,
82 source_text: &str,
83 raw_bytes: &[u8],
84) -> Result<ParsedOntology> {
85 if raw_bytes.len() as u64 > MAX_FILE_BYTES {
86 return Err(ParseError::LimitExceeded(format!(
87 "source exceeds {MAX_FILE_BYTES} bytes: {}",
88 path.display()
89 )));
90 }
91 let rdf_format = to_rdf_format(format, path)?;
92
93 let mut quads = Vec::new();
94 let mut parse_message = None;
95 let mut parse_error_location = None;
96 let mut parse_status = ParseStatus::Ok;
97
98 let parser = RdfParser::from_format(rdf_format);
99 for quad in parser.for_reader(raw_bytes) {
100 match quad {
101 Ok(q) => {
102 if quads.len() >= MAX_TRIPLES_PER_FILE {
103 return Err(ParseError::LimitExceeded(format!(
104 "file exceeds {MAX_TRIPLES_PER_FILE} triples: {}",
105 path.display()
106 )));
107 }
108 quads.push(q);
109 }
110 Err(e) => {
111 parse_status = ParseStatus::Error;
112 parse_message = Some(format_parse_error(&e));
113 parse_error_location = extract_parse_error_location(&e, source_text);
114 break;
115 }
116 }
117 }
118
119 if parse_status == ParseStatus::Error {
120 return Ok(empty_result(
121 ontology_id,
122 parse_status,
123 parse_message,
124 parse_error_location,
125 BTreeMap::new(),
126 ));
127 }
128
129 let mut namespaces =
130 if format == OntologyFormat::TriG { extract_prefixes(&quads) } else { BTreeMap::new() };
131 namespaces.extend(extract_declared_prefixes(source_text, format));
132 if namespaces.is_empty() {
133 namespaces.insert("".to_string(), default_base_iri(path));
134 }
135
136 let mut builder = OntologyBuilder::new(ontology_id.to_string(), namespaces.clone());
137 for quad in &quads {
138 builder.ingest_quad(quad);
139 }
140 builder.finish(parse_status, parse_message, parse_error_location, source_text, quads)
141}
142
143fn to_rdf_format(format: OntologyFormat, path: &Path) -> Result<RdfFormat> {
144 match format {
145 OntologyFormat::Turtle => Ok(RdfFormat::Turtle),
146 OntologyFormat::RdfXml | OntologyFormat::Owl => Ok(RdfFormat::RdfXml),
147 OntologyFormat::JsonLd => Ok(RdfFormat::JsonLd { profile: Default::default() }),
148 OntologyFormat::NTriples => Ok(RdfFormat::NTriples),
149 OntologyFormat::NQuads => Ok(RdfFormat::NQuads),
150 OntologyFormat::TriG => Ok(RdfFormat::TriG),
151 OntologyFormat::Unknown => Err(ParseError::UnsupportedFormat(path.display().to_string())),
152 }
153}
154
155fn format_parse_error(error: &RdfParseError) -> String {
156 error.to_string()
157}
158
159fn extract_parse_error_location(
160 error: &RdfParseError,
161 _source_text: &str,
162) -> Option<SourceLocation> {
163 let msg = error.to_string();
164 let line = msg.split_whitespace().collect::<Vec<_>>().windows(2).find_map(|w| {
165 if w[0].eq_ignore_ascii_case("line") {
166 w[1].trim_end_matches(':').parse().ok()
167 } else {
168 None
169 }
170 });
171 let column = msg.split_whitespace().collect::<Vec<_>>().windows(2).find_map(|w| {
172 if w[0].eq_ignore_ascii_case("column") || w[0].eq_ignore_ascii_case("col") {
173 w[1].trim_end_matches(':').parse().ok()
174 } else {
175 None
176 }
177 });
178 if line.is_some() || column.is_some() {
179 Some(SourceLocation { line, column, ..Default::default() })
180 } else {
181 None
182 }
183}
184
/// Builds the fallback namespace IRI by prefixing the displayed path with `file://`.
///
/// The path is used as given; it is neither made absolute nor percent-encoded.
fn default_base_iri(path: &Path) -> String {
    let mut iri = String::from("file://");
    iri.push_str(&path.display().to_string());
    iri
}
188
189fn extract_declared_prefixes(
190 source_text: &str,
191 format: OntologyFormat,
192) -> BTreeMap<String, String> {
193 let mut prefixes = BTreeMap::new();
194
195 if matches!(format, OntologyFormat::Turtle | OntologyFormat::TriG) {
196 for line in source_text.lines() {
197 let trimmed = line.trim();
198 let rest = trimmed
199 .strip_prefix("@prefix ")
200 .or_else(|| trimmed.strip_prefix("@PREFIX "))
201 .or_else(|| trimmed.strip_prefix("PREFIX "));
202 let Some(rest) = rest else {
203 continue;
204 };
205 let Some((prefix_part, iri_part)) = rest.split_once('<') else {
206 continue;
207 };
208 let prefix = prefix_part.trim().trim_end_matches(':');
209 let Some(iri) = iri_part.split('>').next() else {
210 continue;
211 };
212 prefixes.insert(prefix.to_string(), iri.to_string());
213 }
214 return prefixes;
215 }
216
217 if matches!(format, OntologyFormat::RdfXml | OntologyFormat::Owl) {
218 static XMLNS_ATTR: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
219 regex::Regex::new(r#"xmlns(?::([A-Za-z][\w-]*))?="([^"]+)""#).expect("xmlns regex")
220 });
221 for cap in XMLNS_ATTR.captures_iter(source_text) {
222 let prefix = cap.get(1).map(|m| m.as_str()).unwrap_or("");
223 let iri = cap.get(2).map(|m| m.as_str()).unwrap_or("");
224 if !iri.is_empty() {
225 prefixes.insert(prefix.to_string(), iri.to_string());
226 }
227 }
228 }
229
230 prefixes
231}
232
233fn extract_prefixes(quads: &[Quad]) -> BTreeMap<String, String> {
234 let mut prefixes = BTreeMap::new();
235 for quad in quads {
236 if let oxigraph::model::GraphNameRef::NamedNode(graph) = quad.graph_name.as_ref() {
237 let iri = graph.as_str();
238 if let Some((prefix, _)) = iri.rsplit_once('#') {
239 if let Some((p, _)) = prefix.rsplit_once('/') {
240 prefixes
241 .entry(short_name_from_iri(p))
242 .or_insert_with(|| format!("{}/", prefix.trim_end_matches('#')));
243 }
244 }
245 }
246 }
247 prefixes
248}
249
250fn empty_result(
251 ontology_id: &str,
252 parse_status: ParseStatus,
253 parse_message: Option<String>,
254 parse_error_location: Option<SourceLocation>,
255 namespaces: BTreeMap<String, String>,
256) -> ParsedOntology {
257 ParsedOntology {
258 ontology_id: ontology_id.to_string(),
259 base_iri: namespaces.values().next().cloned(),
260 imports: Vec::new(),
261 namespaces: namespaces.clone(),
262 entities: Vec::new(),
263 annotations: Vec::new(),
264 axioms: Vec::new(),
265 namespace_rows: namespaces
266 .into_iter()
267 .map(|(prefix, iri)| Namespace { prefix, iri, ontology_id: ontology_id.to_string() })
268 .collect(),
269 import_rows: Vec::new(),
270 parse_status,
271 parse_message,
272 parse_error_location,
273 triple_count: 0,
274 quads: Vec::new(),
275 }
276}
277
/// Facts accumulated about one subject while quads are ingested.
struct EntityState {
    /// Best kind seen so far; `EntityKind::Other` until a recognised `rdf:type` arrives.
    kind: EntityKind,
    /// Stringified objects of the subject's `rdfs:label` statements, in ingestion order.
    labels: Vec<String>,
    /// Stringified objects of the subject's `rdfs:comment` statements, in ingestion order.
    comments: Vec<String>,
    /// Value derived from the most recent `owl:deprecated` statement.
    deprecated: bool,
    /// Type IRIs asserted for the subject that map to a recognised kind.
    types: BTreeSet<String>,
}
285
/// Accumulates ontology facts quad by quad; consumed by `finish` to produce the result.
struct OntologyBuilder {
    /// Identifier supplied by the caller; stamped on annotations and axioms as they are
    /// recorded.
    ontology_id: String,
    /// Prefix → namespace IRI map handed in at construction.
    namespaces: BTreeMap<String, String>,
    /// Per-subject state keyed by the stringified subject.
    entities: HashMap<String, EntityState>,
    /// One row per `rdfs:label` / `rdfs:comment` statement.
    annotations: Vec<Annotation>,
    /// One row per `rdfs:subClassOf` statement.
    axioms: Vec<Axiom>,
    /// Objects of `owl:imports` statements.
    imports: BTreeSet<String>,
    /// Subjects typed `owl:Ontology`.
    ontology_iris: BTreeSet<String>,
    /// Number of quads ingested.
    triple_count: usize,
    /// Running counter used to number axiom ids.
    axiom_counter: usize,
}
297
298impl OntologyBuilder {
299 fn new(ontology_id: String, namespaces: BTreeMap<String, String>) -> Self {
300 Self {
301 ontology_id,
302 namespaces,
303 entities: HashMap::new(),
304 annotations: Vec::new(),
305 axioms: Vec::new(),
306 imports: BTreeSet::new(),
307 ontology_iris: BTreeSet::new(),
308 triple_count: 0,
309 axiom_counter: 0,
310 }
311 }
312
313 fn ingest_quad(&mut self, quad: &Quad) {
314 self.triple_count += 1;
315 let subject = subject_to_string(&quad.subject);
316 let predicate = quad.predicate.as_str().to_string();
317 let object = term_to_string(&quad.object);
318
319 if quad.predicate == Rdf::type_() {
320 if let Term::NamedNode(node) = &quad.object {
321 let type_iri = node.as_str();
322 if type_iri == OWL::ontology().as_str() {
323 self.ontology_iris.insert(subject.clone());
324 }
325 let kind = entity_kind_for_type(type_iri);
326 if kind != EntityKind::Other {
327 let entry =
328 self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
329 kind,
330 labels: Vec::new(),
331 comments: Vec::new(),
332 deprecated: false,
333 types: BTreeSet::new(),
334 });
335 entry.types.insert(type_iri.to_string());
336 if entry.kind == EntityKind::Other
337 || kind_priority(kind) > kind_priority(entry.kind)
338 {
339 entry.kind = kind;
340 }
341 }
342 }
343 return;
344 }
345
346 if quad.predicate == OWL::imports() {
347 self.imports.insert(object.clone());
348 return;
349 }
350
351 if quad.predicate == Rdfs::label() {
352 if let Some(entity) = self.entities.get_mut(&subject) {
353 entity.labels.push(object.clone());
354 } else {
355 self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
356 kind: EntityKind::Other,
357 labels: vec![object.clone()],
358 comments: Vec::new(),
359 deprecated: false,
360 types: BTreeSet::new(),
361 });
362 }
363 self.annotations.push(Annotation {
364 subject: subject.clone(),
365 predicate: predicate.clone(),
366 object: object.clone(),
367 ontology_id: self.ontology_id.clone(),
368 source_location: SourceLocation::default(),
369 });
370 return;
371 }
372
373 if quad.predicate == Rdfs::comment() {
374 if let Some(entity) = self.entities.get_mut(&subject) {
375 entity.comments.push(object.clone());
376 } else {
377 self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
378 kind: EntityKind::Other,
379 labels: Vec::new(),
380 comments: vec![object.clone()],
381 deprecated: false,
382 types: BTreeSet::new(),
383 });
384 }
385 self.annotations.push(Annotation {
386 subject: subject.clone(),
387 predicate: predicate.clone(),
388 object: object.clone(),
389 ontology_id: self.ontology_id.clone(),
390 source_location: SourceLocation::default(),
391 });
392 return;
393 }
394
395 if quad.predicate == OWL::deprecated() {
396 let entry = self.entities.entry(subject.clone()).or_insert_with(|| EntityState {
397 kind: EntityKind::Other,
398 labels: Vec::new(),
399 comments: Vec::new(),
400 deprecated: false,
401 types: BTreeSet::new(),
402 });
403 entry.deprecated = object == "true" || object.contains("true");
404 return;
405 }
406
407 if quad.predicate == Rdfs::sub_class_of() {
408 self.axiom_counter += 1;
409 self.axioms.push(Axiom {
410 id: format!("{}#axiom-{}", self.ontology_id, self.axiom_counter),
411 ontology_id: self.ontology_id.clone(),
412 subject: subject.clone(),
413 predicate: predicate.clone(),
414 object: object.clone(),
415 axiom_kind: AXIOM_KIND_SUB_CLASS_OF.to_string(),
416 source_location: SourceLocation::default(),
417 });
418 }
419 }
420
421 fn finish(
422 self,
423 parse_status: ParseStatus,
424 parse_message: Option<String>,
425 parse_error_location: Option<SourceLocation>,
426 source_text: &str,
427 quads: Vec<Quad>,
428 ) -> Result<ParsedOntology> {
429 let base_iri = self
430 .ontology_iris
431 .iter()
432 .next()
433 .cloned()
434 .or_else(|| self.namespaces.get("").cloned())
435 .or_else(|| self.namespaces.values().next().cloned());
436
437 let ontology_id = if let Some(iri) = self.ontology_iris.iter().next() {
438 iri.clone()
439 } else {
440 self.ontology_id.clone()
441 };
442
443 let mut entities = Vec::new();
444 for (iri, state) in &self.entities {
445 if state.kind == EntityKind::Other {
446 continue;
447 }
448 let short_name = short_name_from_iri(iri);
449 entities.push(Entity {
450 iri: iri.clone(),
451 short_name: short_name.clone(),
452 kind: state.kind,
453 ontology_id: ontology_id.clone(),
454 source_location: find_entity_source_location(
455 source_text,
456 iri,
457 &short_name,
458 &self.namespaces,
459 ),
460 labels: state.labels.clone(),
461 comments: state.comments.clone(),
462 deprecated: state.deprecated,
463 });
464 }
465 entities.sort_by(|a, b| a.iri.cmp(&b.iri));
466
467 let namespace_rows = self
468 .namespaces
469 .iter()
470 .map(|(prefix, iri)| Namespace {
471 prefix: prefix.clone(),
472 iri: iri.clone(),
473 ontology_id: ontology_id.clone(),
474 })
475 .collect();
476
477 let import_rows = self
478 .imports
479 .iter()
480 .map(|import_iri| Import {
481 ontology_id: ontology_id.clone(),
482 import_iri: import_iri.clone(),
483 })
484 .collect();
485
486 Ok(ParsedOntology {
487 ontology_id,
488 base_iri,
489 imports: self.imports.into_iter().collect(),
490 namespaces: self.namespaces.clone(),
491 entities,
492 annotations: self.annotations,
493 axioms: self.axioms,
494 namespace_rows,
495 import_rows,
496 parse_status,
497 parse_message,
498 parse_error_location,
499 triple_count: self.triple_count,
500 quads,
501 })
502 }
503}
504
505fn entity_kind_for_type(type_iri: &str) -> EntityKind {
506 match type_iri {
507 t if t == OWL::class().as_str() || t == Rdfs::class().as_str() => EntityKind::Class,
508 t if t == OWL::object_property().as_str() => EntityKind::ObjectProperty,
509 t if t == OWL::datatype_property().as_str() => EntityKind::DataProperty,
510 t if t == OWL::annotation_property().as_str() => EntityKind::AnnotationProperty,
511 t if t == OWL::named_individual().as_str() => EntityKind::Individual,
512 t if t == OWL::ontology().as_str() => EntityKind::Ontology,
513 _ => EntityKind::Other,
514 }
515}
516
517fn kind_priority(kind: EntityKind) -> u8 {
518 match kind {
519 EntityKind::Class => 5,
520 EntityKind::ObjectProperty => 5,
521 EntityKind::DataProperty => 5,
522 EntityKind::AnnotationProperty => 5,
523 EntityKind::Individual => 4,
524 EntityKind::Ontology => 3,
525 EntityKind::Other => 0,
526 }
527}
528
529fn subject_to_string(subject: &Subject) -> String {
530 match subject {
531 Subject::NamedNode(node) => node.as_str().to_string(),
532 Subject::BlankNode(node) => format!("_:{}", node.as_str()),
533 #[allow(unreachable_patterns)]
534 _ => subject.to_string(),
535 }
536}
537
538fn term_to_string(term: &Term) -> String {
539 match term {
540 Term::NamedNode(node) => node.as_str().to_string(),
541 Term::BlankNode(node) => format!("_:{}", node.as_str()),
542 Term::Literal(lit) => lit.to_string(),
543 #[allow(unreachable_patterns)]
544 _ => term.to_string(),
545 }
546}
547
548fn find_entity_source_location(
549 source_text: &str,
550 iri: &str,
551 short_name: &str,
552 namespaces: &BTreeMap<String, String>,
553) -> SourceLocation {
554 let mut needles = vec![iri.to_string(), format!("<{iri}>"), format!(":{short_name}")];
555 for (prefix, ns) in namespaces {
556 if iri.starts_with(ns) && !prefix.is_empty() {
557 needles.push(format!("{prefix}:{short_name}"));
558 }
559 }
560 for line in source_text.lines() {
561 let trimmed = line.trim();
562 if !(trimmed.starts_with("@prefix")
563 || trimmed.starts_with("@PREFIX")
564 || trimmed.starts_with("PREFIX "))
565 {
566 continue;
567 }
568 if let Some(colon) = trimmed.find(':') {
569 let prefix = trimmed["@prefix ".len()..colon].trim();
570 let prefix = prefix.trim_start_matches('@');
571 if let (Some(start), Some(end)) = (line.find('<'), line.find('>')) {
572 let ns = &line[start + 1..end];
573 if iri.starts_with(ns) && !prefix.is_empty() {
574 needles.push(format!("{prefix}:{short_name}"));
575 }
576 }
577 }
578 }
579
580 for (line_idx, line) in source_text.lines().enumerate() {
581 for needle in &needles {
582 if let Some(col) = line.find(needle) {
583 return SourceLocation {
584 line: Some((line_idx + 1) as u64),
585 column: Some(col as u64),
586 start_byte: None,
587 end_byte: None,
588 };
589 }
590 }
591 }
592
593 SourceLocation::default()
594}
595
/// Returns the local part of an IRI: the text after the last `#`, or, when there is no
/// `#`, the text after the last `/`. An IRI containing neither is returned unchanged.
fn short_name_from_iri(iri: &str) -> String {
    let local = iri
        .rsplit_once('#')
        .or_else(|| iri.rsplit_once('/'))
        .map_or(iri, |(_, name)| name);
    local.to_string()
}
605
// Unit tests for the parsing entry points.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// A source one byte over `MAX_FILE_BYTES` is rejected before any parsing happens.
    #[test]
    fn rejects_oversized_source_text() {
        let dir = tempfile::tempdir().unwrap();
        // The path is only used for the error message; the file is never created.
        let path = dir.path().join("huge.ttl");
        let oversized = "x".repeat((MAX_FILE_BYTES + 1) as usize);
        let err = parse_ontology_text(
            &path,
            OntologyFormat::Turtle,
            "doc-1",
            &oversized,
            oversized.as_bytes(),
        )
        .unwrap_err();
        assert!(matches!(err, ParseError::LimitExceeded(_)));
    }

    /// A small Turtle document yields typed entities with their labels and a location.
    #[test]
    fn parses_simple_turtle_ontology() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("test.ttl");
        let mut f = fs::File::create(&path).unwrap();
        writeln!(
            f,
            r#"@prefix ex: <http://example.org/test#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://example.org/test> a owl:Ontology .

ex:Person a owl:Class ;
 rdfs:label "Person" ;
 rdfs:comment "A human being" .

ex:knows a owl:ObjectProperty ;
 rdfs:label "knows" .
"#
        )
        .unwrap();

        let parsed =
            parse_ontology_file(&path, OntologyFormat::Turtle, "doc-1", "hash", 0).unwrap();

        assert_eq!(parsed.parse_status, ParseStatus::Ok);

        let person = parsed
            .entities
            .iter()
            .find(|e| e.iri == "http://example.org/test#Person")
            .expect("Person entity");
        assert_eq!(person.kind, EntityKind::Class);
        // Labels are stored in the literal's rendered form, i.e. including the quotes.
        assert_eq!(person.labels, vec!["\"Person\"".to_string()]);
        assert!(person.source_location.line.is_some());

        let knows = parsed
            .entities
            .iter()
            .find(|e| e.iri == "http://example.org/test#knows")
            .expect("knows property");
        assert_eq!(knows.kind, EntityKind::ObjectProperty);
        assert_eq!(knows.labels, vec!["\"knows\"".to_string()]);
    }

    /// `@prefix` declarations populate the namespace map, and the declared ontology IRI
    /// becomes the base IRI.
    #[test]
    fn extracts_turtle_prefix_declarations() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("test.ttl");
        let mut f = fs::File::create(&path).unwrap();
        writeln!(
            f,
            r#"@prefix ex: <http://example.org/test#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .

<http://example.org/test> a owl:Ontology .
ex:Person a owl:Class .
"#
        )
        .unwrap();

        let parsed =
            parse_ontology_file(&path, OntologyFormat::Turtle, "doc-1", "hash", 0).unwrap();

        assert_eq!(parsed.base_iri.as_deref(), Some("http://example.org/test"));
        assert_eq!(
            parsed.namespaces.get("ex").map(String::as_str),
            Some("http://example.org/test#")
        );
    }
}