Skip to main content

ontologos_core/
iri.rs

1use std::collections::HashMap;
2use std::num::NonZeroU32;
3
4use serde::{Deserialize, Serialize};
5
6use crate::error::{Error, Result};
7use crate::limits::Limits;
8
/// Stable identifier for an interned IRI string.
///
/// Wraps a [`NonZeroU32`] holding a one-based index, so an id of zero cannot
/// be represented. The value is cheap to copy, usable as a hash-map key, and
/// (de)serializable through the serde derives.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct IriId(NonZeroU32);
12
13impl IriId {
14    /// Construct an `IriId` from a one-based index.
15    ///
16    /// # Panics
17    ///
18    /// Panics if `index` is zero. Prefer [`try_from_index`](Self::try_from_index) on untrusted input.
19    #[must_use]
20    pub fn from_index(index: u32) -> Self {
21        Self(NonZeroU32::new(index).expect("IriId index must be non-zero"))
22    }
23
24    /// Fallible constructor for untrusted indices.
25    pub fn try_from_index(index: u32) -> Result<Self> {
26        NonZeroU32::new(index)
27            .map(Self)
28            .ok_or_else(|| Error::InvalidIri("IriId index must be non-zero".into()))
29    }
30
31    /// One-based index into the intern pool.
32    #[must_use]
33    pub fn index(self) -> u32 {
34        self.0.get()
35    }
36}
37
/// Deduplicating pool of absolute IRI strings.
///
/// Each string is stored twice: once in insertion order for id-to-string
/// resolution and once as a map key for string-to-id lookup.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct InternPool {
    /// Interned strings in insertion order; position `i` holds the string whose id index is `i + 1`.
    strings: Vec<Box<str>>,
    /// Reverse lookup from IRI text to the id assigned when it was first interned.
    index: HashMap<Box<str>, IriId>,
}
44
45impl InternPool {
46    /// Create an empty intern pool.
47    #[must_use]
48    pub fn new() -> Self {
49        Self::default()
50    }
51
52    /// Number of unique IRIs in the pool.
53    #[must_use]
54    pub fn len(&self) -> usize {
55        self.strings.len()
56    }
57
58    /// Returns `true` if the pool contains no IRIs.
59    #[must_use]
60    pub fn is_empty(&self) -> bool {
61        self.strings.is_empty()
62    }
63
64    /// Look up an already-interned IRI without inserting.
65    #[must_use]
66    pub fn get(&self, iri: &str) -> Option<IriId> {
67        self.index.get(iri).copied()
68    }
69
70    /// Intern an absolute IRI, returning an existing id if already present.
71    pub fn intern(&mut self, iri: &str) -> Result<IriId> {
72        self.intern_with_limit(iri, Limits::default().max_iri_len)
73    }
74
75    /// Intern an IRI with a custom maximum length.
76    pub fn intern_with_limit(&mut self, iri: &str, max_len: usize) -> Result<IriId> {
77        validate_iri_with_max_len(iri, max_len)?;
78
79        if let Some(&id) = self.index.get(iri) {
80            return Ok(id);
81        }
82
83        let index = u32::try_from(self.strings.len())
84            .map_err(|_| Error::InvalidIri("IRI pool capacity exceeded".into()))?
85            + 1;
86        let id = IriId(NonZeroU32::new(index).expect("index is non-zero"));
87        let owned: Box<str> = iri.into();
88        self.index.insert(owned.clone(), id);
89        self.strings.push(owned);
90        Ok(id)
91    }
92
93    /// Resolve an interned IRI to its string value.
94    pub fn resolve(&self, id: IriId) -> Result<&str> {
95        let idx = usize::try_from(id.index())
96            .map_err(|_| Error::InvalidIri(format!("invalid IriId index: {}", id.index())))?
97            - 1;
98        self.strings
99            .get(idx)
100            .map(AsRef::as_ref)
101            .ok_or_else(|| Error::InvalidIri(format!("unknown IriId: {}", id.index())))
102    }
103
104    /// Iterate over all interned IRI strings in insertion order.
105    pub fn iter(&self) -> impl Iterator<Item = &str> {
106        self.strings.iter().map(AsRef::as_ref)
107    }
108}
109
/// Schemes accepted by [`validate_iri_with_max_len`]; compared exactly (case-sensitively)
/// against the text before the first `:` of the IRI.
const ALLOWED_SCHEMES: &[&str] = &["http", "https", "urn"];
111
112/// Validate that `iri` is an absolute IRI with an allowed scheme.
113pub fn validate_iri(iri: &str) -> Result<()> {
114    validate_iri_with_max_len(iri, Limits::default().max_iri_len)
115}
116
117/// Validate an IRI with a custom maximum length.
118pub fn validate_iri_with_max_len(iri: &str, max_len: usize) -> Result<()> {
119    if iri.is_empty() {
120        return Err(Error::InvalidIri("IRI must not be empty".into()));
121    }
122
123    if iri.len() > max_len {
124        return Err(Error::InvalidIri(format!(
125            "IRI exceeds maximum length of {max_len} bytes"
126        )));
127    }
128
129    if iri.chars().any(|c| c.is_ascii_control()) {
130        return Err(Error::InvalidIri(format!(
131            "IRI contains control characters: {iri}"
132        )));
133    }
134
135    if iri.contains(' ') {
136        return Err(Error::InvalidIri(format!("IRI contains whitespace: {iri}")));
137    }
138
139    let Some((scheme, rest)) = iri.split_once(':') else {
140        return Err(Error::InvalidIri(format!(
141            "IRI must be absolute (scheme:...): {iri}"
142        )));
143    };
144
145    if scheme.is_empty() || rest.is_empty() {
146        return Err(Error::InvalidIri(format!("invalid IRI scheme: {iri}")));
147    }
148
149    if !scheme
150        .chars()
151        .all(|c| c.is_ascii_alphanumeric() || c == '+' || c == '-' || c == '.')
152    {
153        return Err(Error::InvalidIri(format!("invalid IRI scheme: {iri}")));
154    }
155
156    if !ALLOWED_SCHEMES.contains(&scheme) {
157        return Err(Error::InvalidIri(format!(
158            "IRI scheme '{scheme}' is not allowed (allowed: http, https, urn)"
159        )));
160    }
161
162    Ok(())
163}
164
#[cfg(test)]
mod tests {
    use super::*;

    /// Interning the same IRI twice yields one id and one pool entry.
    #[test]
    fn intern_deduplicates() {
        let mut pool = InternPool::new();
        let first = pool.intern("http://example.org/A").expect("intern");
        let repeat = pool.intern("http://example.org/A").expect("intern");
        let other = pool.intern("http://example.org/B").expect("intern");
        assert_eq!(first, repeat);
        assert_ne!(first, other);
        assert_eq!(pool.len(), 2);
    }

    /// An id handed out by `intern` resolves back to the original text.
    #[test]
    fn resolve_round_trip() {
        let iri = "http://example.org/Test";
        let mut pool = InternPool::new();
        let id = pool.intern(iri).expect("intern");
        let resolved = pool.resolve(id).expect("resolve");
        assert_eq!(resolved, iri);
    }

    /// A string without a scheme is refused.
    #[test]
    fn rejects_relative_iri() {
        let mut pool = InternPool::new();
        let outcome = pool.intern("relative/path");
        assert!(outcome.is_err());
    }

    /// `urn:` IRIs are accepted and round-trip through the pool.
    #[test]
    fn accepts_urn() {
        let iri = "urn:example:animal";
        let mut pool = InternPool::new();
        let id = pool.intern(iri).expect("intern");
        let resolved = pool.resolve(id).expect("resolve");
        assert_eq!(resolved, iri);
    }

    /// The empty string is not a valid IRI.
    #[test]
    fn rejects_empty_iri() {
        let outcome = validate_iri("");
        assert!(outcome.is_err());
    }

    /// An embedded space is refused.
    #[test]
    fn rejects_whitespace_iri() {
        let outcome = validate_iri("http://example.org/a b");
        assert!(outcome.is_err());
    }

    /// Schemes outside the allow-list are refused.
    #[test]
    fn rejects_javascript_scheme() {
        let outcome = validate_iri("javascript:alert(1)");
        assert!(outcome.is_err());
    }

    /// A horizontal tab counts as a control character and is refused.
    #[test]
    fn rejects_control_characters() {
        let outcome = validate_iri("http://example.org/\u{0009}");
        assert!(outcome.is_err());
    }

    /// Zero is never a valid one-based index.
    #[test]
    fn try_from_index_rejects_zero() {
        let outcome = IriId::try_from_index(0);
        assert!(outcome.is_err());
    }

    /// Looking up a string in an empty pool finds nothing.
    #[test]
    fn get_returns_none_for_unknown() {
        let pool = InternPool::new();
        let found = pool.get("http://example.org/missing");
        assert!(found.is_none());
    }

    /// Resolving an id the pool never issued reports `InvalidIri`.
    #[test]
    fn resolve_unknown_id_errors() {
        let pool = InternPool::new();
        let missing = IriId::from_index(1);
        let err = pool.resolve(missing).expect_err("unknown id");
        assert!(matches!(err, Error::InvalidIri(_)));
    }
}