1use std::collections::HashMap;
2use std::hash::{Hash, Hasher};
3use std::num::NonZeroU32;
4
5use serde::{Deserialize, Serialize};
6
7use crate::error::{Error, Result};
8use crate::limits::Limits;
9
10#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
12pub struct IriId(NonZeroU32);
13
14impl IriId {
15 #[must_use]
21 pub fn from_index(index: u32) -> Self {
22 Self(NonZeroU32::new(index).expect("IriId index must be non-zero"))
23 }
24
25 pub fn try_from_index(index: u32) -> Result<Self> {
27 NonZeroU32::new(index)
28 .map(Self)
29 .ok_or_else(|| Error::InvalidIri("IriId index must be non-zero".into()))
30 }
31
32 #[must_use]
34 pub fn index(self) -> u32 {
35 self.0.get()
36 }
37}
38
/// Hashes an IRI string to a 64-bit value.
///
/// A fresh `DefaultHasher` is created for every call, so equal strings
/// produce equal hashes within one build of the program. The standard
/// library does not specify the algorithm, so the values must not be
/// persisted or compared across Rust releases.
fn hash_iri(s: &str) -> u64 {
    let mut state = std::collections::hash_map::DefaultHasher::new();
    Hash::hash(s, &mut state);
    Hasher::finish(&state)
}
44
45#[derive(Debug, Default, Clone, PartialEq, Eq)]
50pub struct InternPool {
51 strings: Vec<Box<str>>,
52 index: HashMap<u64, Vec<usize>>,
53}
54
55impl InternPool {
56 #[must_use]
58 pub fn new() -> Self {
59 Self::default()
60 }
61
62 #[must_use]
64 pub fn len(&self) -> usize {
65 self.strings.len()
66 }
67
68 #[must_use]
70 pub fn is_empty(&self) -> bool {
71 self.strings.is_empty()
72 }
73
74 #[must_use]
76 pub fn get(&self, iri: &str) -> Option<IriId> {
77 self.lookup(iri)
78 .map(|idx| IriId::from_index((idx + 1) as u32))
79 }
80
81 fn lookup(&self, iri: &str) -> Option<usize> {
82 let bucket = self.index.get(&hash_iri(iri))?;
83 bucket
84 .iter()
85 .copied()
86 .find(|&idx| self.strings[idx].as_ref() == iri)
87 }
88
89 pub fn intern(&mut self, iri: &str) -> Result<IriId> {
91 self.intern_with_limit(iri, Limits::default().max_iri_len)
92 }
93
94 pub fn intern_with_limit(&mut self, iri: &str, max_len: usize) -> Result<IriId> {
96 validate_iri_with_max_len(iri, max_len)?;
97
98 if let Some(idx) = self.lookup(iri) {
99 return Ok(IriId::from_index((idx + 1) as u32));
100 }
101
102 let index = u32::try_from(self.strings.len())
103 .map_err(|_| Error::InvalidIri("IRI pool capacity exceeded".into()))?
104 + 1;
105 let id = IriId(NonZeroU32::new(index).expect("index is non-zero"));
106 let owned: Box<str> = iri.into();
107 let idx = self.strings.len();
108 self.strings.push(owned);
109 self.index.entry(hash_iri(iri)).or_default().push(idx);
110 Ok(id)
111 }
112
113 pub fn resolve(&self, id: IriId) -> Result<&str> {
115 let idx = usize::try_from(id.index())
116 .map_err(|_| Error::InvalidIri(format!("invalid IriId index: {}", id.index())))?
117 - 1;
118 self.strings
119 .get(idx)
120 .map(AsRef::as_ref)
121 .ok_or_else(|| Error::InvalidIri(format!("unknown IriId: {}", id.index())))
122 }
123
124 pub fn iter(&self) -> impl Iterator<Item = &str> {
126 self.strings.iter().map(AsRef::as_ref)
127 }
128}
129
/// Schemes accepted by the general-purpose validators
/// (`validate_iri` / `validate_iri_with_max_len`).
const ALLOWED_SCHEMES: &[&str] = &[
    "http",
    "https",
    "urn",
    "file",
    "internal",
];

/// Schemes accepted by the snapshot validators; unlike [`ALLOWED_SCHEMES`],
/// `file` and `internal` are not included.
const SNAPSHOT_ALLOWED_SCHEMES: &[&str] = &[
    "http",
    "https",
    "urn",
];
132
133pub fn validate_iri(iri: &str) -> Result<()> {
135 validate_iri_with_max_len(iri, Limits::default().max_iri_len)
136}
137
138pub fn validate_iri_with_max_len(iri: &str, max_len: usize) -> Result<()> {
140 validate_iri_schemes(iri, max_len, ALLOWED_SCHEMES)
141}
142
143pub fn validate_snapshot_iri(iri: &str) -> Result<()> {
145 validate_snapshot_iri_with_max_len(iri, Limits::default().max_iri_len)
146}
147
148pub fn validate_snapshot_iri_with_max_len(iri: &str, max_len: usize) -> Result<()> {
150 validate_iri_schemes(iri, max_len, SNAPSHOT_ALLOWED_SCHEMES)
151}
152
153fn validate_iri_schemes(iri: &str, max_len: usize, allowed_schemes: &[&str]) -> Result<()> {
154 if iri.is_empty() {
155 return Err(Error::InvalidIri("IRI must not be empty".into()));
156 }
157
158 if iri.len() > max_len {
159 return Err(Error::InvalidIri(format!(
160 "IRI exceeds maximum length of {max_len} bytes"
161 )));
162 }
163
164 if iri.chars().any(|c| c.is_ascii_control()) {
165 return Err(Error::InvalidIri(format!(
166 "IRI contains control characters: {iri}"
167 )));
168 }
169
170 if iri.contains(' ') {
171 return Err(Error::InvalidIri(format!("IRI contains whitespace: {iri}")));
172 }
173
174 let Some((scheme, rest)) = iri.split_once(':') else {
175 return Err(Error::InvalidIri(format!(
176 "IRI must be absolute (scheme:...): {iri}"
177 )));
178 };
179
180 if scheme.is_empty() || rest.is_empty() {
181 return Err(Error::InvalidIri(format!("invalid IRI scheme: {iri}")));
182 }
183
184 if !scheme
185 .chars()
186 .all(|c| c.is_ascii_alphanumeric() || c == '+' || c == '-' || c == '.')
187 {
188 return Err(Error::InvalidIri(format!("invalid IRI scheme: {iri}")));
189 }
190
191 if !allowed_schemes.contains(&scheme) {
192 return Err(Error::InvalidIri(format!(
193 "IRI scheme '{scheme}' is not allowed (allowed: {})",
194 allowed_schemes.join(", ")
195 )));
196 }
197
198 Ok(())
199}
200
/// Rewrites every percent-encoded `#` (`%23`) in `iri` to a literal `#`.
///
/// The input is returned borrowed when it contains no `%23`; a new string
/// is allocated only when at least one replacement is made.
#[must_use]
pub fn normalize_iri_fragment_encoding(iri: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    match iri.find("%23") {
        None => Cow::Borrowed(iri),
        Some(_) => Cow::Owned(iri.replace("%23", "#")),
    }
}
213
#[cfg(test)]
mod tests {
    use super::*;

    /// A literal `#` and its `%23` encoding normalize to the same text.
    #[test]
    fn normalize_iri_fragment_encoding_collapses_percent23() {
        let literal = normalize_iri_fragment_encoding("https://example.org/ont.owl#Food");
        let encoded = normalize_iri_fragment_encoding("https://example.org/ont.owl%23Food");
        assert_eq!(literal, encoded);
    }

    /// Interning the same IRI twice yields one id and one stored entry.
    #[test]
    fn intern_deduplicates() {
        let mut pool = InternPool::new();
        let first = pool.intern("http://example.org/A").expect("intern");
        let repeat = pool.intern("http://example.org/A").expect("intern");
        let other = pool.intern("http://example.org/B").expect("intern");

        assert_eq!(first, repeat);
        assert_ne!(first, other);
        assert_eq!(pool.len(), 2);
    }

    /// An id returned by `intern` resolves back to the original text.
    #[test]
    fn resolve_round_trip() {
        let iri = "http://example.org/Test";
        let mut pool = InternPool::new();
        let id = pool.intern(iri).expect("intern");
        assert_eq!(pool.resolve(id).expect("resolve"), iri);
    }

    /// IRIs without a scheme are refused.
    #[test]
    fn rejects_relative_iri() {
        let outcome = InternPool::new().intern("relative/path");
        assert!(outcome.is_err());
    }

    /// `urn:` IRIs are accepted and round-trip.
    #[test]
    fn accepts_urn() {
        let iri = "urn:example:animal";
        let mut pool = InternPool::new();
        let id = pool.intern(iri).expect("intern");
        assert_eq!(pool.resolve(id).expect("resolve"), iri);
    }

    /// The empty string is not a valid IRI.
    #[test]
    fn rejects_empty_iri() {
        let outcome = validate_iri("");
        assert!(outcome.is_err());
    }

    /// A space anywhere in the IRI is refused.
    #[test]
    fn rejects_whitespace_iri() {
        let outcome = validate_iri("http://example.org/a b");
        assert!(outcome.is_err());
    }

    /// Schemes outside the allow-list are refused.
    #[test]
    fn rejects_javascript_scheme() {
        let outcome = validate_iri("javascript:alert(1)");
        assert!(outcome.is_err());
    }

    /// Snapshot validation refuses `file:` but accepts `https:`.
    #[test]
    fn snapshot_iri_rejects_file_scheme() {
        let refused = validate_snapshot_iri("file:///etc/passwd");
        let accepted = validate_snapshot_iri("https://example.org/C");
        assert!(refused.is_err());
        assert!(accepted.is_ok());
    }

    /// A tab character counts as a control character.
    #[test]
    fn rejects_control_characters() {
        let outcome = validate_iri("http://example.org/\u{0009}");
        assert!(outcome.is_err());
    }

    /// Zero is never a valid raw index.
    #[test]
    fn try_from_index_rejects_zero() {
        let outcome = IriId::try_from_index(0);
        assert!(outcome.is_err());
    }

    /// Looking up an IRI that was never interned yields `None`.
    #[test]
    fn get_returns_none_for_unknown() {
        let pool = InternPool::new();
        let found = pool.get("http://example.org/missing");
        assert!(found.is_none());
    }

    /// Resolving an id the pool never issued is an `InvalidIri` error.
    #[test]
    fn resolve_unknown_id_errors() {
        let pool = InternPool::new();
        let stray = IriId::from_index(1);
        let err = pool.resolve(stray).expect_err("unknown id");
        assert!(matches!(err, Error::InvalidIri(_)));
    }

    /// One interned IRI occupies one string slot and one index entry.
    #[test]
    fn single_storage_no_duplicate_bytes() {
        let mut pool = InternPool::new();
        pool.intern("http://example.org/A").expect("intern");

        let indexed_slots: usize = pool.index.values().map(Vec::len).sum();
        assert_eq!(pool.strings.len(), 1);
        assert_eq!(indexed_slots, 1);
    }
}