//! `ontologos_core/iri.rs`: interned IRI identifiers and IRI validation.

1use std::collections::HashMap;
2use std::hash::{Hash, Hasher};
3use std::num::NonZeroU32;
4
5use serde::{Deserialize, Serialize};
6
7use crate::error::{Error, Result};
8use crate::limits::Limits;
9
10/// Stable identifier for an interned IRI string.
11#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
12pub struct IriId(NonZeroU32);
13
14impl IriId {
15    /// Construct an `IriId` from a one-based index.
16    ///
17    /// # Panics
18    ///
19    /// Panics if `index` is zero. Prefer [`try_from_index`](Self::try_from_index) on untrusted input.
20    #[must_use]
21    pub fn from_index(index: u32) -> Self {
22        Self(NonZeroU32::new(index).expect("IriId index must be non-zero"))
23    }
24
25    /// Fallible constructor for untrusted indices.
26    pub fn try_from_index(index: u32) -> Result<Self> {
27        NonZeroU32::new(index)
28            .map(Self)
29            .ok_or_else(|| Error::InvalidIri("IriId index must be non-zero".into()))
30    }
31
32    /// One-based index into the intern pool.
33    #[must_use]
34    pub fn index(self) -> u32 {
35        self.0.get()
36    }
37}
38
/// Compute the 64-bit bucket key used to index `s` in the intern pool.
///
/// A new `DefaultHasher` is created on every call and fed only `s`.
fn hash_iri(s: &str) -> u64 {
    use std::collections::hash_map::DefaultHasher;

    let mut state = DefaultHasher::new();
    Hash::hash(s, &mut state);
    Hasher::finish(&state)
}
44
45/// Deduplicating pool of absolute IRI strings.
46///
47/// Each IRI is stored once in `strings`; `index` maps hash buckets to string indices
48/// (collision chains compare full string equality).
49#[derive(Debug, Default, Clone, PartialEq, Eq)]
50pub struct InternPool {
51    strings: Vec<Box<str>>,
52    index: HashMap<u64, Vec<usize>>,
53}
54
55impl InternPool {
56    /// Create an empty intern pool.
57    #[must_use]
58    pub fn new() -> Self {
59        Self::default()
60    }
61
62    /// Number of unique IRIs in the pool.
63    #[must_use]
64    pub fn len(&self) -> usize {
65        self.strings.len()
66    }
67
68    /// Returns `true` if the pool contains no IRIs.
69    #[must_use]
70    pub fn is_empty(&self) -> bool {
71        self.strings.is_empty()
72    }
73
74    /// Look up an already-interned IRI without inserting.
75    #[must_use]
76    pub fn get(&self, iri: &str) -> Option<IriId> {
77        self.lookup(iri)
78            .map(|idx| IriId::from_index((idx + 1) as u32))
79    }
80
81    fn lookup(&self, iri: &str) -> Option<usize> {
82        let bucket = self.index.get(&hash_iri(iri))?;
83        bucket
84            .iter()
85            .copied()
86            .find(|&idx| self.strings[idx].as_ref() == iri)
87    }
88
89    /// Intern an absolute IRI, returning an existing id if already present.
90    pub fn intern(&mut self, iri: &str) -> Result<IriId> {
91        self.intern_with_limit(iri, Limits::default().max_iri_len)
92    }
93
94    /// Intern an IRI with a custom maximum length.
95    pub fn intern_with_limit(&mut self, iri: &str, max_len: usize) -> Result<IriId> {
96        validate_iri_with_max_len(iri, max_len)?;
97
98        if let Some(idx) = self.lookup(iri) {
99            return Ok(IriId::from_index((idx + 1) as u32));
100        }
101
102        let index = u32::try_from(self.strings.len())
103            .map_err(|_| Error::InvalidIri("IRI pool capacity exceeded".into()))?
104            + 1;
105        let id = IriId(NonZeroU32::new(index).expect("index is non-zero"));
106        let owned: Box<str> = iri.into();
107        let idx = self.strings.len();
108        self.strings.push(owned);
109        self.index.entry(hash_iri(iri)).or_default().push(idx);
110        Ok(id)
111    }
112
113    /// Resolve an interned IRI to its string value.
114    pub fn resolve(&self, id: IriId) -> Result<&str> {
115        let idx = usize::try_from(id.index())
116            .map_err(|_| Error::InvalidIri(format!("invalid IriId index: {}", id.index())))?
117            - 1;
118        self.strings
119            .get(idx)
120            .map(AsRef::as_ref)
121            .ok_or_else(|| Error::InvalidIri(format!("unknown IriId: {}", id.index())))
122    }
123
124    /// Iterate over all interned IRI strings in insertion order.
125    pub fn iter(&self) -> impl Iterator<Item = &str> {
126        self.strings.iter().map(AsRef::as_ref)
127    }
128}
129
/// Schemes accepted by [`validate_iri`] and [`validate_iri_with_max_len`].
const ALLOWED_SCHEMES: &[&str] = &[
    "http",
    "https",
    "urn",
    "file",
    "internal",
];

/// Narrower scheme list used by [`validate_snapshot_iri`] and
/// [`validate_snapshot_iri_with_max_len`].
const SNAPSHOT_ALLOWED_SCHEMES: &[&str] = &[
    "http",
    "https",
    "urn",
];
132
133/// Validate that `iri` is an absolute IRI with an allowed scheme.
134pub fn validate_iri(iri: &str) -> Result<()> {
135    validate_iri_with_max_len(iri, Limits::default().max_iri_len)
136}
137
138/// Validate an IRI with a custom maximum length.
139pub fn validate_iri_with_max_len(iri: &str, max_len: usize) -> Result<()> {
140    validate_iri_schemes(iri, max_len, ALLOWED_SCHEMES)
141}
142
143/// Validate an IRI from an untrusted JSON snapshot (`http`, `https`, `urn` only).
144pub fn validate_snapshot_iri(iri: &str) -> Result<()> {
145    validate_snapshot_iri_with_max_len(iri, Limits::default().max_iri_len)
146}
147
148/// Validate a snapshot IRI with a custom maximum length.
149pub fn validate_snapshot_iri_with_max_len(iri: &str, max_len: usize) -> Result<()> {
150    validate_iri_schemes(iri, max_len, SNAPSHOT_ALLOWED_SCHEMES)
151}
152
153fn validate_iri_schemes(iri: &str, max_len: usize, allowed_schemes: &[&str]) -> Result<()> {
154    if iri.is_empty() {
155        return Err(Error::InvalidIri("IRI must not be empty".into()));
156    }
157
158    if iri.len() > max_len {
159        return Err(Error::InvalidIri(format!(
160            "IRI exceeds maximum length of {max_len} bytes"
161        )));
162    }
163
164    if iri.chars().any(|c| c.is_ascii_control()) {
165        return Err(Error::InvalidIri(format!(
166            "IRI contains control characters: {iri}"
167        )));
168    }
169
170    if iri.contains(' ') {
171        return Err(Error::InvalidIri(format!("IRI contains whitespace: {iri}")));
172    }
173
174    let Some((scheme, rest)) = iri.split_once(':') else {
175        return Err(Error::InvalidIri(format!(
176            "IRI must be absolute (scheme:...): {iri}"
177        )));
178    };
179
180    if scheme.is_empty() || rest.is_empty() {
181        return Err(Error::InvalidIri(format!("invalid IRI scheme: {iri}")));
182    }
183
184    if !scheme
185        .chars()
186        .all(|c| c.is_ascii_alphanumeric() || c == '+' || c == '-' || c == '.')
187    {
188        return Err(Error::InvalidIri(format!("invalid IRI scheme: {iri}")));
189    }
190
191    if !allowed_schemes.contains(&scheme) {
192        return Err(Error::InvalidIri(format!(
193            "IRI scheme '{scheme}' is not allowed (allowed: {})",
194            allowed_schemes.join(", ")
195        )));
196    }
197
198    Ok(())
199}
200
/// Rewrite every `%23` in `iri` as `#` (RDF/XML fragment encoding artifact).
///
/// OWL/RDF loaders may emit the same logical IRI as `.../owl#Food` and `.../owl%23Food`.
/// Entity registration collapses these to a single interned IRI.
///
/// The input is returned borrowed when it contains no `%23`; otherwise a new string is
/// allocated.
#[must_use]
pub fn normalize_iri_fragment_encoding(iri: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    const ENCODED_HASH: &str = "%23";

    if !iri.contains(ENCODED_HASH) {
        return Cow::Borrowed(iri);
    }
    Cow::Owned(iri.replace(ENCODED_HASH, "#"))
}
213
#[cfg(test)]
mod tests {
    use super::*;

    /// The `#` and `%23` spellings of one IRI normalize to the same text.
    #[test]
    fn normalize_iri_fragment_encoding_collapses_percent23() {
        let plain = normalize_iri_fragment_encoding("https://example.org/ont.owl#Food");
        let encoded = normalize_iri_fragment_encoding("https://example.org/ont.owl%23Food");
        assert_eq!(plain.as_ref(), encoded.as_ref());
    }

    /// Interning the same IRI twice yields one entry and one id.
    #[test]
    fn intern_deduplicates() {
        let mut pool = InternPool::new();
        let first = pool.intern("http://example.org/A").expect("intern");
        let repeat = pool.intern("http://example.org/A").expect("intern");
        let other = pool.intern("http://example.org/B").expect("intern");
        assert_eq!(first, repeat);
        assert_ne!(first, other);
        assert_eq!(pool.len(), 2);
    }

    /// An id returned by `intern` resolves back to the original string.
    #[test]
    fn resolve_round_trip() {
        let iri = "http://example.org/Test";
        let mut pool = InternPool::new();
        let id = pool.intern(iri).expect("intern");
        assert_eq!(pool.resolve(id).expect("resolve"), iri);
    }

    /// An IRI without a scheme is refused.
    #[test]
    fn rejects_relative_iri() {
        let mut pool = InternPool::new();
        let outcome = pool.intern("relative/path");
        assert!(outcome.is_err());
    }

    /// `urn:` IRIs are accepted and round-trip.
    #[test]
    fn accepts_urn() {
        let iri = "urn:example:animal";
        let mut pool = InternPool::new();
        let id = pool.intern(iri).expect("intern");
        assert_eq!(pool.resolve(id).expect("resolve"), iri);
    }

    /// The empty string is not a valid IRI.
    #[test]
    fn rejects_empty_iri() {
        let verdict = validate_iri("");
        assert!(verdict.is_err());
    }

    /// A space inside the IRI is refused.
    #[test]
    fn rejects_whitespace_iri() {
        let verdict = validate_iri("http://example.org/a b");
        assert!(verdict.is_err());
    }

    /// Schemes outside the allow-list are refused.
    #[test]
    fn rejects_javascript_scheme() {
        let verdict = validate_iri("javascript:alert(1)");
        assert!(verdict.is_err());
    }

    /// Snapshot validation refuses `file:` but accepts `https:`.
    #[test]
    fn snapshot_iri_rejects_file_scheme() {
        let local_file = validate_snapshot_iri("file:///etc/passwd");
        let remote = validate_snapshot_iri("https://example.org/C");
        assert!(local_file.is_err());
        assert!(remote.is_ok());
    }

    /// A tab character inside the IRI is refused.
    #[test]
    fn rejects_control_characters() {
        let verdict = validate_iri("http://example.org/\u{0009}");
        assert!(verdict.is_err());
    }

    /// Zero is never a valid id index.
    #[test]
    fn try_from_index_rejects_zero() {
        let attempt = IriId::try_from_index(0);
        assert!(attempt.is_err());
    }

    /// `get` does not insert and reports unknown IRIs as `None`.
    #[test]
    fn get_returns_none_for_unknown() {
        let pool = InternPool::new();
        let found = pool.get("http://example.org/missing");
        assert!(found.is_none());
    }

    /// Resolving an id the pool never issued is an `InvalidIri` error.
    #[test]
    fn resolve_unknown_id_errors() {
        let pool = InternPool::new();
        let stray = IriId::from_index(1);
        let err = pool.resolve(stray).expect_err("unknown id");
        assert!(matches!(err, Error::InvalidIri(_)));
    }

    /// The index stores positions only, so each IRI's bytes live in exactly one place.
    #[test]
    fn single_storage_no_duplicate_bytes() {
        let mut pool = InternPool::new();
        pool.intern("http://example.org/A").expect("intern");
        // One string in vec; index holds only usize indices, not duplicate strings.
        let indexed_slots: usize = pool.index.values().map(Vec::len).sum();
        assert_eq!(pool.strings.len(), 1);
        assert_eq!(indexed_slots, 1);
    }
}
314}