Skip to main content

ontologos_parser/
load.rs

1use std::fs::{File, OpenOptions};
2use std::io::{Read, Seek, SeekFrom};
3use std::path::{Component, Path, PathBuf};
4
5use ontologos_core::Ontology;
6
7use crate::limits::ParseLimits;
8use crate::map::map_to_core;
9use crate::read::{read_horned_owl_from_reader, sniff_and_rewind};
10use crate::{
11    detect_format, detect_format_from_bytes, detect_functional_from_bytes,
12    detect_turtle_from_bytes, Error, Format, Result,
13};
14
// `O_NOFOLLOW` flag for `open(2)`, passed through `OpenOptionsExt::custom_flags`.
// The numeric value is part of each platform's ABI: it differs between
// operating systems and, on Linux, between CPU architectures, so it must be
// selected per target rather than hard-coded once.

/// Linux on architectures whose kernel headers override the generic value
/// (ARM, AArch64, PowerPC, m68k): `O_NOFOLLOW` is `0o100000`.
#[cfg(all(
    target_os = "linux",
    any(
        target_arch = "arm",
        target_arch = "aarch64",
        target_arch = "powerpc",
        target_arch = "powerpc64",
        target_arch = "m68k"
    )
))]
const O_NOFOLLOW: i32 = 0o100_000;
/// Linux on architectures using the asm-generic value (x86, x86_64, RISC-V,
/// MIPS, SPARC, s390x, ...): `O_NOFOLLOW` is `0o400000`. On these targets
/// `0o100000` is `O_LARGEFILE`, which would leave symlinks followed.
#[cfg(all(
    target_os = "linux",
    not(any(
        target_arch = "arm",
        target_arch = "aarch64",
        target_arch = "powerpc",
        target_arch = "powerpc64",
        target_arch = "m68k"
    ))
))]
const O_NOFOLLOW: i32 = 0o400_000;
/// macOS: `O_NOFOLLOW` is `0x0100` (`0x0040` is `O_ASYNC`).
#[cfg(target_os = "macos")]
const O_NOFOLLOW: i32 = 0x0000_0100;
/// Other Unix targets: no flag is passed, so the open itself does not refuse
/// symlinks there.
#[cfg(all(unix, not(any(target_os = "linux", target_os = "macos"))))]
const O_NOFOLLOW: i32 = 0;
21
22/// Resolve and validate a path before loading an ontology file.
23pub fn validate_load_path(path: &Path, base: Option<&Path>) -> Result<PathBuf> {
24    let normalized = normalize_path(path)?;
25
26    if let Some(base) = base {
27        let base_normalized = normalize_path(base)?;
28        if !path_is_under_base(&normalized, &base_normalized) {
29            return Err(Error::Parse(format!(
30                "path {} escapes allowed base {}",
31                normalized.display(),
32                base_normalized.display()
33            )));
34        }
35    }
36
37    Ok(normalized)
38}
39
40/// Load an ontology from a validated file path.
41pub fn load_ontology(path: &Path) -> Result<Ontology> {
42    load_ontology_with_limits(path, ParseLimits::default())
43}
44
45/// Load an ontology constrained to stay under `base` (untrusted uploads).
46pub fn load_ontology_in(base: &Path, path: &Path) -> Result<Ontology> {
47    load_ontology_with_limits_and_base(path, ParseLimits::default(), Some(base))
48}
49
50/// Load an ontology with custom [`ParseLimits`].
51pub fn load_ontology_with_limits(path: &Path, limits: ParseLimits) -> Result<Ontology> {
52    load_ontology_with_limits_and_base(path, limits, None)
53}
54
55/// Load an ontology with custom limits and optional sandbox base directory.
56pub fn load_ontology_with_limits_and_base(
57    path: &Path,
58    limits: ParseLimits,
59    base: Option<&Path>,
60) -> Result<Ontology> {
61    let validated = validate_load_path(path, base)?;
62    if !validated.is_file() {
63        return Err(Error::Parse(format!("not a file: {}", validated.display())));
64    }
65
66    let mut file = open_for_load(&validated, base)?;
67    let file_len = file
68        .metadata()
69        .map_err(|e| Error::Parse(e.to_string()))?
70        .len();
71    if file_len as usize > limits.max_file_bytes {
72        return Err(Error::Parse(format!(
73            "file size {file_len} exceeds limit of {} bytes",
74            limits.max_file_bytes
75        )));
76    }
77    let format = detect_format_with_sniff(path, &mut file)?;
78    let set_ontology = read_horned_owl_from_reader(&mut file, format, limits)?;
79    let (mut ontology, report) = map_to_core(&set_ontology, limits)?;
80    ontology.set_parse_meta(report.into_meta());
81    Ok(ontology)
82}
83
84fn open_for_load(path: &Path, base: Option<&Path>) -> Result<File> {
85    let pre_meta = std::fs::symlink_metadata(path).map_err(|e| Error::Parse(e.to_string()))?;
86    let file = open_readonly_nofollow(path)?;
87    if let Some(base) = base {
88        verify_opened_under_base(&file, base, path, &pre_meta)?;
89    }
90    Ok(file)
91}
92
93fn open_readonly_nofollow(path: &Path) -> Result<File> {
94    #[cfg(unix)]
95    {
96        use std::os::unix::fs::OpenOptionsExt;
97        OpenOptions::new()
98            .read(true)
99            .custom_flags(O_NOFOLLOW)
100            .open(path)
101            .map_err(|e| Error::Parse(e.to_string()))
102    }
103    #[cfg(not(unix))]
104    {
105        File::open(path).map_err(|e| Error::Parse(e.to_string()))
106    }
107}
108
109fn verify_opened_under_base(
110    file: &File,
111    base: &Path,
112    validated: &Path,
113    pre_meta: &std::fs::Metadata,
114) -> Result<()> {
115    #[cfg(unix)]
116    use std::os::unix::fs::MetadataExt;
117
118    let file_meta = file.metadata().map_err(|e| Error::Parse(e.to_string()))?;
119    #[cfg(unix)]
120    if pre_meta.dev() != file_meta.dev() || pre_meta.ino() != file_meta.ino() {
121        return Err(Error::Parse(
122            "ontology path changed between validation and open".into(),
123        ));
124    }
125    #[cfg(not(unix))]
126    let _ = (pre_meta, file_meta);
127
128    let base_normalized = normalize_path(base)?;
129    let base_canon = base_normalized
130        .canonicalize()
131        .map_err(|e| Error::Parse(e.to_string()))?;
132
133    if let Ok(opened) = opened_path(file) {
134        let opened_canon = opened
135            .canonicalize()
136            .map_err(|e| Error::Parse(e.to_string()))?;
137        if !path_is_under_base(&opened_canon, &base_canon) {
138            return Err(Error::Parse(format!(
139                "opened file {} escapes allowed base {}",
140                opened_canon.display(),
141                base_canon.display()
142            )));
143        }
144        return Ok(());
145    }
146
147    let validated_canon = validated
148        .canonicalize()
149        .map_err(|e| Error::Parse(e.to_string()))?;
150    if !path_is_under_base(&validated_canon, &base_canon) {
151        return Err(Error::Parse(format!(
152            "path {} escapes allowed base {}",
153            validated_canon.display(),
154            base_canon.display()
155        )));
156    }
157    Ok(())
158}
159
160#[cfg(target_os = "linux")]
161fn opened_path(file: &File) -> Result<PathBuf> {
162    use std::os::unix::io::AsRawFd;
163    let fd = file.as_raw_fd();
164    std::fs::read_link(format!("/proc/self/fd/{fd}")).map_err(|e| Error::Parse(e.to_string()))
165}
166
/// macOS: asks the kernel for the path of `file` via `fcntl(F_GETPATH)`.
///
/// The returned bytes are converted to a path without any UTF-8 re-encoding,
/// so the result names exactly the file the kernel reported.
///
/// # Errors
/// Returns [`Error::Parse`] when the `fcntl` call fails or the buffer holds no
/// NUL terminator.
#[cfg(target_os = "macos")]
fn opened_path(file: &File) -> Result<PathBuf> {
    use std::ffi::{CStr, OsStr};
    use std::os::unix::ffi::OsStrExt;
    use std::os::unix::io::AsRawFd;

    const F_GETPATH: i32 = 50;
    let fd = file.as_raw_fd();
    // Sized for the platform's maximum path length (MAXPATHLEN is 1024 on macOS).
    let mut buf = [0u8; 1024];
    // SAFETY: `fd` is a live descriptor borrowed from `file` for the whole
    // call, and `buf` is a writable buffer of MAXPATHLEN bytes, which is the
    // size `F_GETPATH` requires of its argument.
    let rc = unsafe { libc::fcntl(fd, F_GETPATH, buf.as_mut_ptr()) };
    if rc == -1 {
        return Err(Error::Parse("fcntl(F_GETPATH) failed".into()));
    }
    let cstr = CStr::from_bytes_until_nul(&buf).map_err(|e| Error::Parse(e.to_string()))?;
    // Build the path from the raw bytes: a lossy UTF-8 conversion would
    // replace invalid sequences and silently name a different path.
    Ok(PathBuf::from(OsStr::from_bytes(cstr.to_bytes())))
}
182
/// Fallback for platforms other than Linux and macOS: always fails with
/// [`Error::Parse`], since no descriptor-to-path lookup is implemented here.
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
fn opened_path(_file: &File) -> Result<PathBuf> {
    let reason = String::from("fd path resolution unavailable");
    Err(Error::Parse(reason))
}
187
188fn detect_format_with_sniff(path: &Path, reader: &mut (impl Read + Seek)) -> Result<Format> {
189    if let Some(format) = detect_format(path) {
190        reader
191            .seek(SeekFrom::Start(0))
192            .map_err(|e| Error::Parse(e.to_string()))?;
193        return Ok(format);
194    }
195
196    let header = sniff_and_rewind(reader, 4096)?;
197    if let Some(format) = detect_format_from_bytes(&header) {
198        return Ok(format);
199    }
200    if detect_turtle_from_bytes(&header) {
201        return Ok(Format::Turtle);
202    }
203    if detect_functional_from_bytes(&header) {
204        return Ok(Format::Functional);
205    }
206
207    Err(Error::UnsupportedFormat(format!(
208        "could not detect OWL/RDF format for {}",
209        path.display()
210    )))
211}
212
213fn normalize_path(path: &Path) -> Result<PathBuf> {
214    let base = if path.is_absolute() {
215        PathBuf::new()
216    } else {
217        std::env::current_dir().map_err(|e| Error::Parse(e.to_string()))?
218    };
219
220    let mut normalized = base;
221    for component in path.components() {
222        match component {
223            Component::Prefix(_) | Component::RootDir => normalized.push(component.as_os_str()),
224            Component::CurDir => {}
225            Component::ParentDir => {
226                if !normalized.pop() {
227                    return Err(Error::Parse("path escapes beyond filesystem root".into()));
228                }
229            }
230            Component::Normal(part) => normalized.push(part),
231        }
232    }
233
234    if normalized.exists() {
235        normalized = normalized
236            .canonicalize()
237            .map_err(|e| Error::Parse(e.to_string()))?;
238    }
239
240    Ok(normalized)
241}
242
/// Reports whether `path` equals `base` or lies beneath it.
///
/// The comparison is made on whole path components, so `/a/bc` is not
/// considered to be under `/a/b`. An empty `base` matches every path.
fn path_is_under_base(path: &Path, base: &Path) -> bool {
    // `Path::starts_with` matches component by component, never by raw
    // string prefix.
    path.starts_with(base)
}
254
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::{Path, PathBuf};

    /// Returns a fresh, empty directory under the system temp dir.
    ///
    /// The name embeds the process id so separate test processes do not share
    /// state, and anything left behind by an earlier aborted run is removed
    /// first (a stale symlink would otherwise make `symlink` fail with EEXIST).
    fn scratch_dir(label: &str) -> PathBuf {
        let dir = std::env::temp_dir().join(format!("ontologos_{label}_{}", std::process::id()));
        let _ = std::fs::remove_dir_all(&dir);
        std::fs::create_dir_all(&dir).expect("create scratch dir");
        dir
    }

    /// A relative path climbing out of the base must be rejected.
    #[test]
    fn rejects_path_traversal_outside_base() {
        let base = std::env::current_dir().expect("cwd");
        let err = validate_load_path(Path::new("../../../etc/passwd"), Some(&base))
            .expect_err("traversal");
        assert!(matches!(err, Error::Parse(_)));
    }

    /// A sibling directory sharing the base's name as a string prefix must
    /// not be treated as being inside the base.
    #[test]
    fn rejects_path_prefix_bypass() {
        let parent = scratch_dir("prefix_bypass");
        let base = parent.join("uploads_base");
        let evil = parent.join("uploads_base_evil");
        std::fs::create_dir_all(&base).expect("create base");
        std::fs::create_dir_all(&evil).expect("create evil sibling");
        let file = evil.join("secret.owl");
        std::fs::write(&file, b"<rdf:RDF/>").expect("write file");

        let result = validate_load_path(&file, Some(&base));
        // Clean up before asserting so a failure does not leak fixtures.
        let _ = std::fs::remove_dir_all(&parent);

        let err = result.expect_err("prefix bypass");
        assert!(matches!(err, Error::Parse(_)));
    }

    /// A file nested below the base validates and stays under the canonical base.
    #[test]
    fn path_is_under_base_accepts_nested_file() {
        let parent = scratch_dir("nested");
        let base = parent.join("base");
        let nested = base.join("nested");
        std::fs::create_dir_all(&nested).expect("create nested");
        let file = nested.join("ontology.owl");
        std::fs::write(&file, b"<rdf:RDF/>").expect("write file");

        let result = validate_load_path(&file, Some(&base));
        let base_canon = base.canonicalize();
        // Clean up before asserting so a failure does not leak fixtures.
        let _ = std::fs::remove_dir_all(&parent);

        let validated = result.expect("nested file under base");
        assert!(path_is_under_base(
            &validated,
            &base_canon.expect("canonicalize base")
        ));
    }

    /// A symlink inside the base that points outside it must not be loaded.
    #[cfg(unix)]
    #[test]
    fn sandboxed_load_does_not_follow_symlink_to_outside_file() {
        use std::os::unix::fs::symlink;

        let parent = scratch_dir("symlink_escape");
        let base = parent.join("base");
        let outside = parent.join("outside_secret.owl");
        let link = base.join("ontology.owl");
        std::fs::create_dir_all(&base).expect("create base");
        std::fs::write(&outside, b"OUTSIDE_SECRET_CONTENT").expect("write outside");

        symlink(&outside, &link).expect("symlink");

        let result = load_ontology_in(&base, &link);
        // Clean up before asserting so a failure does not leak fixtures.
        let _ = std::fs::remove_dir_all(&parent);

        let err = result.expect_err("symlink escape");
        assert!(matches!(err, Error::Parse(_) | Error::UnsupportedFormat(_)));
    }
}