Skip to main content

cognee_ontology/
loader.rs

1//! RDF/OWL ontology file loading using sophia.
2//!
3//! Provides format auto-detection and multi-file merging with
4//! permissive error handling (logs warnings, continues with valid files).
5
6use sophia_api::graph::{Graph, MutableGraph};
7use sophia_api::parser::TripleParser;
8use sophia_api::prelude::{Quad, QuadParser, QuadSource};
9use sophia_api::source::TripleSource;
10use sophia_inmem::graph::FastGraph;
11use sophia_jsonld::JsonLdParser;
12use sophia_turtle::parser::turtle;
13use sophia_xml::parser::RdfXmlParser;
14use std::io::Read;
15use std::path::{Path, PathBuf};
16use tracing::{info, warn};
17
18use crate::error::{OntologyError, OntologyResult};
19
/// Where ontology data is read from.
///
/// Covers one or several filesystem paths as well as one or several
/// arbitrary [`Read`] implementors (such as in-memory buffers).
pub enum OntologyFileInput {
    /// One file on disk.
    Path(PathBuf),
    /// Several files on disk.
    Paths(Vec<PathBuf>),
    /// One readable source (e.g., an in-memory buffer).
    Reader(Box<dyn Read>),
    /// Several readable sources.
    Readers(Vec<Box<dyn Read>>),
}

/// An owned path becomes a single-file input.
impl From<PathBuf> for OntologyFileInput {
    fn from(path: PathBuf) -> Self {
        Self::Path(path)
    }
}

/// A list of owned paths becomes a multi-file input.
impl From<Vec<PathBuf>> for OntologyFileInput {
    fn from(paths: Vec<PathBuf>) -> Self {
        Self::Paths(paths)
    }
}

/// A string slice is interpreted as a single file path.
impl From<&str> for OntologyFileInput {
    fn from(path: &str) -> Self {
        Self::Path(path.into())
    }
}

/// A list of string slices is interpreted as a list of file paths.
impl From<Vec<&str>> for OntologyFileInput {
    fn from(paths: Vec<&str>) -> Self {
        let owned: Vec<PathBuf> = paths.iter().map(PathBuf::from).collect();
        Self::Paths(owned)
    }
}
57
/// RDF serialization formats recognized by file extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum RdfFormat {
    Turtle,   // .ttl
    RdfXml,   // .rdf, .owl, .xml
    NTriples, // .nt
    JsonLd,   // .jsonld
}

impl RdfFormat {
    /// Map a path's extension (compared case-insensitively) to a format.
    ///
    /// Returns `None` when the path has no extension, the extension is not
    /// valid UTF-8, or it is not one of the recognized extensions.
    fn from_path(path: &Path) -> Option<Self> {
        let extension = path.extension()?.to_str()?.to_lowercase();

        match extension.as_str() {
            "ttl" => Some(Self::Turtle),
            "rdf" | "owl" | "xml" => Some(Self::RdfXml),
            "nt" => Some(Self::NTriples),
            "jsonld" => Some(Self::JsonLd),
            _ => None,
        }
    }
}
81
82/// Load ontology files and merge into a single graph.
83///
84/// Matches Python's permissive error handling: logs warnings for missing
85/// files but continues with valid files. Returns `None` if all files fail.
86pub fn load_ontology_files(input: OntologyFileInput) -> OntologyResult<Option<FastGraph>> {
87    match input {
88        OntologyFileInput::Path(path) => load_single_path(&path),
89        OntologyFileInput::Paths(paths) => load_multiple_paths(&paths),
90        OntologyFileInput::Reader(reader) => load_single_reader(reader),
91        OntologyFileInput::Readers(readers) => load_multiple_readers(readers),
92    }
93}
94
95fn load_single_path(path: &Path) -> OntologyResult<Option<FastGraph>> {
96    if !path.exists() {
97        warn!(
98            "Ontology file '{}' not found. Skipping this file.",
99            path.display()
100        );
101        return Ok(None);
102    }
103
104    let content = std::fs::read_to_string(path).map_err(|e| {
105        OntologyError::FileNotFound(format!("Failed to read file '{}': {}", path.display(), e))
106    })?;
107
108    let format = RdfFormat::from_path(path).ok_or_else(|| {
109        OntologyError::ParseError(format!(
110            "Unknown RDF format for file '{}'. Supported: .ttl, .rdf, .owl, .nt, .jsonld",
111            path.display()
112        ))
113    })?;
114
115    let parse_result = match format {
116        RdfFormat::Turtle => parse_turtle_with_path_base(path, &content),
117        _ => parse_rdf(&content, format),
118    };
119
120    match parse_result {
121        Ok(graph) => {
122            info!("Ontology loaded successfully from file: {}", path.display());
123            Ok(Some(graph))
124        }
125        Err(e) => {
126            warn!("Failed to parse ontology file '{}': {}", path.display(), e);
127            Ok(None)
128        }
129    }
130}
131
132fn parse_turtle_with_path_base(path: &Path, content: &str) -> OntologyResult<FastGraph> {
133    let absolute = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
134    let base_iri = format!("file://{}", absolute.to_string_lossy());
135    let content_with_base = format!("@base <{base_iri}> .\n{content}");
136
137    parse_rdf(&content_with_base, RdfFormat::Turtle)
138}
139
140fn load_multiple_paths(paths: &[PathBuf]) -> OntologyResult<Option<FastGraph>> {
141    if paths.is_empty() {
142        info!("No ontology file provided. No owl ontology will be attached to the graph.");
143        return Ok(None);
144    }
145
146    let mut merged_graph = FastGraph::new();
147    let mut loaded_count = 0;
148
149    for path in paths {
150        match load_single_path(path) {
151            Ok(Some(graph)) => {
152                merged_graph.insert_all(graph.triples()).map_err(|e| {
153                    OntologyError::ParseError(format!(
154                        "Failed to merge graph from '{}': {}",
155                        path.display(),
156                        e
157                    ))
158                })?;
159                loaded_count += 1;
160            }
161            Ok(None) => {}
162            Err(e) => warn!(
163                "Failed to process ontology file '{}': {}",
164                path.display(),
165                e
166            ),
167        }
168    }
169
170    if loaded_count == 0 {
171        info!("No valid ontology files found. No owl ontology will be attached to the graph.");
172        Ok(None)
173    } else {
174        info!("Total ontology files loaded: {}", loaded_count);
175        Ok(Some(merged_graph))
176    }
177}
178
179fn load_single_reader(mut reader: Box<dyn Read>) -> OntologyResult<Option<FastGraph>> {
180    let mut content = String::new();
181    reader
182        .read_to_string(&mut content)
183        .map_err(|e| OntologyError::FileNotFound(format!("Failed to read from reader: {e}")))?;
184
185    // Prefer RDF/XML (Python parity), but permissively fall back for other valid RDF payloads.
186    let parse_attempts = [
187        RdfFormat::RdfXml,
188        RdfFormat::Turtle,
189        RdfFormat::JsonLd,
190        RdfFormat::NTriples,
191    ];
192
193    let mut last_error: Option<OntologyError> = None;
194    let mut parsed_graph: Option<FastGraph> = None;
195
196    for format in parse_attempts {
197        match parse_rdf(&content, format) {
198            Ok(graph) => {
199                parsed_graph = Some(graph);
200                break;
201            }
202            Err(e) => last_error = Some(e),
203        }
204    }
205
206    match parsed_graph {
207        Some(graph) => {
208            info!("Ontology loaded successfully from reader");
209            Ok(Some(graph))
210        }
211        None => {
212            let err_message = last_error
213                .map(|e| e.to_string())
214                .unwrap_or_else(|| "Unknown parse error".to_string());
215            warn!("Failed to parse ontology from reader: {}", err_message);
216            Ok(None)
217        }
218    }
219}
220
221fn load_multiple_readers(readers: Vec<Box<dyn Read>>) -> OntologyResult<Option<FastGraph>> {
222    if readers.is_empty() {
223        info!("No ontology file provided. No owl ontology will be attached to the graph.");
224        return Ok(None);
225    }
226
227    let mut merged_graph = FastGraph::new();
228    let mut loaded_count = 0;
229
230    for reader in readers {
231        if let Some(graph) = load_single_reader(reader)? {
232            merged_graph.insert_all(graph.triples()).map_err(|e| {
233                OntologyError::ParseError(format!("Failed to merge graph from reader: {e}"))
234            })?;
235            loaded_count += 1;
236        }
237    }
238
239    if loaded_count == 0 {
240        info!("No valid ontology readers found. No owl ontology will be attached to the graph.");
241        Ok(None)
242    } else {
243        info!("Total ontology readers loaded: {}", loaded_count);
244        Ok(Some(merged_graph))
245    }
246}
247
248/// Parse RDF content with specified format.
249fn parse_rdf(content: &str, format: RdfFormat) -> OntologyResult<FastGraph> {
250    match format {
251        RdfFormat::Turtle | RdfFormat::NTriples => turtle::parse_str(content)
252            .collect_triples()
253            .map_err(|e| OntologyError::ParseError(format!("Turtle/N-Triples parse error: {e}"))),
254        RdfFormat::RdfXml => RdfXmlParser::default()
255            .parse_str(content)
256            .collect_triples()
257            .map_err(|e| OntologyError::ParseError(format!("RDF/XML parse error: {e}"))),
258        RdfFormat::JsonLd => JsonLdParser::new()
259            .parse_str(content)
260            .filter_quads(|q| q.g().is_none())
261            .map_quads(Quad::into_triple)
262            .collect_triples()
263            .map_err(|e| OntologyError::ParseError(format!("JSON-LD parse error: {e}"))),
264    }
265}
266
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    reason = "test code — panics are acceptable failures"
)]
mod tests {
    use super::*;

    /// A `.ttl` extension is detected as Turtle.
    #[test]
    fn test_format_detection_turtle() {
        let detected = RdfFormat::from_path(Path::new("ontology.ttl"));
        assert_eq!(detected, Some(RdfFormat::Turtle));
    }

    /// Both `.rdf` and `.owl` are detected as RDF/XML.
    #[test]
    fn test_format_detection_rdfxml() {
        for name in ["ontology.rdf", "ontology.owl"] {
            assert_eq!(
                RdfFormat::from_path(Path::new(name)),
                Some(RdfFormat::RdfXml)
            );
        }
    }

    /// An unrecognized extension yields no format.
    #[test]
    fn test_format_detection_unknown() {
        let detected = RdfFormat::from_path(Path::new("ontology.txt"));
        assert_eq!(detected, None);
    }

    /// A missing file is skipped rather than reported as an error.
    #[test]
    fn test_load_missing_file() {
        let loaded = load_single_path(Path::new("nonexistent.ttl")).unwrap();
        assert!(loaded.is_none());
    }

    /// A minimal Turtle document produces at least one triple.
    #[test]
    fn test_parse_simple_turtle() {
        let ttl = r#"
            @prefix ex: <http://example.org#> .
            ex:Car a ex:Vehicle .
        "#;

        let triple_count = parse_rdf(ttl, RdfFormat::Turtle)
            .unwrap()
            .triples()
            .count();
        assert_ne!(triple_count, 0);
    }

    /// A minimal RDF/XML document produces at least one triple.
    #[test]
    fn test_parse_simple_rdfxml() {
        let rdfxml = r#"<?xml version="1.0"?>
            <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
                     xmlns:ex="http://example.org#">
              <rdf:Description rdf:about="http://example.org#Car">
                <rdf:type rdf:resource="http://example.org#Vehicle"/>
              </rdf:Description>
            </rdf:RDF>"#;

        let triple_count = parse_rdf(rdfxml, RdfFormat::RdfXml)
            .unwrap()
            .triples()
            .count();
        assert_ne!(triple_count, 0);
    }

    /// A minimal JSON-LD document produces at least one triple.
    #[test]
    fn test_parse_simple_jsonld() {
        let jsonld = r#"{
            "@context": {
                "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                "ex": "http://example.org#",
                "type": {"@id": "rdf:type", "@type": "@id"}
            },
            "@id": "ex:Car",
            "type": "ex:Vehicle"
        }"#;

        let triple_count = parse_rdf(jsonld, RdfFormat::JsonLd)
            .unwrap()
            .triples()
            .count();
        assert_ne!(triple_count, 0);
    }

    /// Text that is not Turtle is rejected.
    #[test]
    fn test_parse_invalid_turtle() {
        assert!(parse_rdf("invalid turtle syntax !!!", RdfFormat::Turtle).is_err());
    }

    /// Malformed RDF/XML is rejected.
    #[test]
    fn test_parse_invalid_rdfxml() {
        assert!(parse_rdf("<rdf:RDF><rdf:Description></rdf:RDF>", RdfFormat::RdfXml).is_err());
    }

    /// Malformed JSON-LD is rejected.
    #[test]
    fn test_parse_invalid_jsonld() {
        assert!(parse_rdf("{invalid json-ld}", RdfFormat::JsonLd).is_err());
    }

    /// Empty Turtle input parses to an empty graph.
    #[test]
    fn test_parse_empty_turtle() {
        let empty = parse_rdf("", RdfFormat::Turtle).unwrap();
        assert_eq!(empty.triples().count(), 0);
    }
}
375}