1use crate::model::{BlankNode, Literal, NamedNode};
5use siphasher::sip128::{Hasher128, SipHasher24};
6use std::collections::HashMap;
7use std::hash::{Hash, Hasher};
8use std::sync::{Arc, RwLock};
9
10#[derive(Eq, PartialEq, Debug, Clone, Copy)]
12pub struct OxiStrHash {
13 hash: [u8; 16],
14}
15
16impl OxiStrHash {
17 pub fn new(value: &str) -> Self {
18 let mut hasher = SipHasher24::new();
19 hasher.write(value.as_bytes());
20 Self {
21 hash: u128::from(hasher.finish128()).to_be_bytes(),
22 }
23 }
24
25 #[inline]
26 pub fn from_be_bytes(hash: [u8; 16]) -> Self {
27 Self { hash }
28 }
29
30 #[inline]
31 pub fn to_be_bytes(self) -> [u8; 16] {
32 self.hash
33 }
34}
35
36impl Hash for OxiStrHash {
37 #[inline]
38 fn hash<H: Hasher>(&self, state: &mut H) {
39 state.write_u128(u128::from_ne_bytes(self.hash))
40 }
41}
42
43#[derive(Debug, Clone, PartialEq)]
45pub enum OxiEncodedTerm {
46 DefaultGraph,
47 NamedNode {
48 iri: OxiStrHash,
49 },
50 BlankNode {
51 id: OxiStrHash,
52 },
53 Literal {
54 value: OxiStrHash,
55 datatype: Option<OxiStrHash>,
56 language: Option<String>,
57 },
58 BooleanLiteral(bool),
60 IntegerLiteral(i64),
61 FloatLiteral(f32),
62 DoubleLiteral(f64),
63 StringLiteral(OxiStrHash),
64}
65
66impl Hash for OxiEncodedTerm {
67 fn hash<H: Hasher>(&self, state: &mut H) {
68 match self {
69 OxiEncodedTerm::DefaultGraph => {
70 0u8.hash(state);
71 }
72 OxiEncodedTerm::NamedNode { iri } => {
73 1u8.hash(state);
74 iri.hash(state);
75 }
76 OxiEncodedTerm::BlankNode { id } => {
77 2u8.hash(state);
78 id.hash(state);
79 }
80 OxiEncodedTerm::Literal {
81 value,
82 datatype,
83 language,
84 } => {
85 3u8.hash(state);
86 value.hash(state);
87 datatype.hash(state);
88 language.hash(state);
89 }
90 OxiEncodedTerm::BooleanLiteral(value) => {
91 4u8.hash(state);
92 value.hash(state);
93 }
94 OxiEncodedTerm::IntegerLiteral(value) => {
95 5u8.hash(state);
96 value.hash(state);
97 }
98 OxiEncodedTerm::FloatLiteral(value) => {
99 6u8.hash(state);
100 value.to_bits().hash(state);
102 }
103 OxiEncodedTerm::DoubleLiteral(value) => {
104 7u8.hash(state);
105 value.to_bits().hash(state);
107 }
108 OxiEncodedTerm::StringLiteral(value) => {
109 8u8.hash(state);
110 value.hash(state);
111 }
112 }
113 }
114}
115
116impl Eq for OxiEncodedTerm {}
117
118#[derive(Debug, Default)]
120pub struct StringInterner {
121 string_storage: HashMap<OxiStrHash, String>,
123 total_strings: usize,
125 total_deduplication_saves: usize,
126}
127
128impl StringInterner {
129 pub fn new() -> Self {
130 Self::default()
131 }
132
133 pub fn intern(&mut self, value: &str) -> OxiStrHash {
135 let hash = OxiStrHash::new(value);
136
137 if let std::collections::hash_map::Entry::Vacant(e) = self.string_storage.entry(hash) {
138 e.insert(value.to_string());
139 self.total_strings += 1;
140 } else {
141 self.total_deduplication_saves += 1;
142 }
143
144 hash
145 }
146
147 pub fn resolve(&self, hash: &OxiStrHash) -> Option<&str> {
149 self.string_storage.get(hash).map(|s| s.as_str())
150 }
151
152 pub fn stats(&self) -> InternerStats {
154 InternerStats {
155 total_strings: self.total_strings,
156 deduplication_saves: self.total_deduplication_saves,
157 memory_usage: self.string_storage.values().map(|s| s.len()).sum(),
158 }
159 }
160}
161
162#[derive(Debug, Clone)]
163pub struct InternerStats {
164 pub total_strings: usize,
165 pub deduplication_saves: usize,
166 pub memory_usage: usize,
167}
168
169pub struct OptimizedTermEncoder {
171 interner: Arc<RwLock<StringInterner>>,
172}
173
174impl OptimizedTermEncoder {
175 pub fn new() -> Self {
176 Self {
177 interner: Arc::new(RwLock::new(StringInterner::new())),
178 }
179 }
180
181 pub fn encode_named_node(&self, node: &NamedNode) -> OxiEncodedTerm {
183 let mut interner = self
184 .interner
185 .write()
186 .expect("interner lock should not be poisoned");
187 let iri_hash = interner.intern(node.as_str());
188 OxiEncodedTerm::NamedNode { iri: iri_hash }
189 }
190
191 pub fn encode_blank_node(&self, node: &BlankNode) -> OxiEncodedTerm {
193 let mut interner = self
194 .interner
195 .write()
196 .expect("interner lock should not be poisoned");
197 let id_hash = interner.intern(node.as_str());
198 OxiEncodedTerm::BlankNode { id: id_hash }
199 }
200
201 pub fn encode_literal(&self, literal: &Literal) -> OxiEncodedTerm {
203 let literal_str = literal.value();
204
205 let datatype = literal.datatype();
207 match datatype.as_str() {
208 "http://www.w3.org/2001/XMLSchema#boolean" => {
209 if let Ok(value) = literal_str.parse::<bool>() {
210 return OxiEncodedTerm::BooleanLiteral(value);
211 }
212 }
213 "http://www.w3.org/2001/XMLSchema#integer"
214 | "http://www.w3.org/2001/XMLSchema#int"
215 | "http://www.w3.org/2001/XMLSchema#long" => {
216 if let Ok(value) = literal_str.parse::<i64>() {
217 return OxiEncodedTerm::IntegerLiteral(value);
218 }
219 }
220 "http://www.w3.org/2001/XMLSchema#float" => {
221 if let Ok(value) = literal_str.parse::<f32>() {
222 return OxiEncodedTerm::FloatLiteral(value);
223 }
224 }
225 "http://www.w3.org/2001/XMLSchema#double" => {
226 if let Ok(value) = literal_str.parse::<f64>() {
227 return OxiEncodedTerm::DoubleLiteral(value);
228 }
229 }
230 "http://www.w3.org/2001/XMLSchema#string" => {
231 let mut interner = self
232 .interner
233 .write()
234 .expect("interner lock should not be poisoned");
235 let value_hash = interner.intern(literal_str);
236 return OxiEncodedTerm::StringLiteral(value_hash);
237 }
238 _ => {
239 }
241 }
242
243 let mut interner = self
245 .interner
246 .write()
247 .expect("interner lock should not be poisoned");
248 let value_hash = interner.intern(literal_str);
249
250 let datatype_hash = Some(interner.intern(datatype.as_str()));
251 let language = literal.language().map(|lang| lang.to_string());
252
253 OxiEncodedTerm::Literal {
254 value: value_hash,
255 datatype: datatype_hash,
256 language,
257 }
258 }
259
260 pub fn decode_term(&self, encoded: &OxiEncodedTerm) -> Result<DecodedTerm, String> {
262 let interner = self
263 .interner
264 .read()
265 .expect("interner lock should not be poisoned");
266
267 match encoded {
268 OxiEncodedTerm::DefaultGraph => Ok(DecodedTerm::DefaultGraph),
269
270 OxiEncodedTerm::NamedNode { iri } => {
271 let iri_str = interner
272 .resolve(iri)
273 .ok_or("IRI hash not found in interner")?;
274 Ok(DecodedTerm::NamedNode(iri_str.to_string()))
275 }
276
277 OxiEncodedTerm::BlankNode { id } => {
278 let id_str = interner
279 .resolve(id)
280 .ok_or("Blank node ID hash not found in interner")?;
281 Ok(DecodedTerm::BlankNode(id_str.to_string()))
282 }
283
284 OxiEncodedTerm::BooleanLiteral(value) => Ok(DecodedTerm::Literal {
285 value: value.to_string(),
286 datatype: Some("http://www.w3.org/2001/XMLSchema#boolean".to_string()),
287 language: None,
288 }),
289
290 OxiEncodedTerm::IntegerLiteral(value) => Ok(DecodedTerm::Literal {
291 value: value.to_string(),
292 datatype: Some("http://www.w3.org/2001/XMLSchema#integer".to_string()),
293 language: None,
294 }),
295
296 OxiEncodedTerm::FloatLiteral(value) => Ok(DecodedTerm::Literal {
297 value: value.to_string(),
298 datatype: Some("http://www.w3.org/2001/XMLSchema#float".to_string()),
299 language: None,
300 }),
301
302 OxiEncodedTerm::DoubleLiteral(value) => Ok(DecodedTerm::Literal {
303 value: value.to_string(),
304 datatype: Some("http://www.w3.org/2001/XMLSchema#double".to_string()),
305 language: None,
306 }),
307
308 OxiEncodedTerm::StringLiteral(value_hash) => {
309 let value_str = interner
310 .resolve(value_hash)
311 .ok_or("String literal hash not found in interner")?;
312 Ok(DecodedTerm::Literal {
313 value: value_str.to_string(),
314 datatype: Some("http://www.w3.org/2001/XMLSchema#string".to_string()),
315 language: None,
316 })
317 }
318
319 OxiEncodedTerm::Literal {
320 value,
321 datatype,
322 language,
323 } => {
324 let value_str = interner
325 .resolve(value)
326 .ok_or("Literal value hash not found in interner")?;
327
328 let datatype_str = if let Some(dt_hash) = datatype {
329 Some(
330 interner
331 .resolve(dt_hash)
332 .ok_or("Datatype hash not found in interner")?
333 .to_string(),
334 )
335 } else {
336 None
337 };
338
339 Ok(DecodedTerm::Literal {
340 value: value_str.to_string(),
341 datatype: datatype_str,
342 language: language.clone(),
343 })
344 }
345 }
346 }
347
348 pub fn stats(&self) -> InternerStats {
350 self.interner
351 .read()
352 .expect("interner lock should not be poisoned")
353 .stats()
354 }
355}
356
357impl Default for OptimizedTermEncoder {
358 fn default() -> Self {
359 Self::new()
360 }
361}
362
363#[derive(Debug, Clone, PartialEq)]
365pub enum DecodedTerm {
366 DefaultGraph,
367 NamedNode(String),
368 BlankNode(String),
369 Literal {
370 value: String,
371 datatype: Option<String>,
372 language: Option<String>,
373 },
374}
375
376#[cfg(test)]
377mod tests {
378 use super::*;
379 use crate::model::{Literal, NamedNode};
380
381 #[test]
382 fn test_string_interner() {
383 let mut interner = StringInterner::new();
384
385 let hash1 = interner.intern("http://example.org/test");
386 let hash2 = interner.intern("http://example.org/test"); let hash3 = interner.intern("http://example.org/other");
388
389 assert_eq!(hash1, hash2); assert_ne!(hash1, hash3); assert_eq!(interner.resolve(&hash1), Some("http://example.org/test"));
393 assert_eq!(interner.resolve(&hash3), Some("http://example.org/other"));
394
395 let stats = interner.stats();
396 assert_eq!(stats.total_strings, 2); assert_eq!(stats.deduplication_saves, 1); }
399
400 #[test]
401 fn test_optimized_encoding() -> Result<(), Box<dyn std::error::Error>> {
402 let encoder = OptimizedTermEncoder::new();
403
404 let named_node = NamedNode::new("http://example.org/test")?;
406 let encoded = encoder.encode_named_node(&named_node);
407
408 match encoder.decode_term(&encoded)? {
409 DecodedTerm::NamedNode(iri) => {
410 assert_eq!(iri, "http://example.org/test");
411 }
412 _ => panic!("Expected named node"),
413 }
414
415 let int_literal = Literal::new_typed_literal(
417 "42",
418 NamedNode::new("http://www.w3.org/2001/XMLSchema#integer")?,
419 );
420 let encoded = encoder.encode_literal(&int_literal);
421
422 assert!(matches!(encoded, OxiEncodedTerm::IntegerLiteral(42)));
423
424 let bool_literal = Literal::new_typed_literal(
426 "true",
427 NamedNode::new("http://www.w3.org/2001/XMLSchema#boolean")?,
428 );
429 let encoded = encoder.encode_literal(&bool_literal);
430
431 assert!(matches!(encoded, OxiEncodedTerm::BooleanLiteral(true)));
432
433 Ok(())
434 }
435
436 #[test]
437 fn test_hash_consistency() {
438 let hash1 = OxiStrHash::new("test string");
439 let hash2 = OxiStrHash::new("test string");
440 let hash3 = OxiStrHash::new("different string");
441
442 assert_eq!(hash1, hash2);
443 assert_ne!(hash1, hash3);
444
445 let bytes = hash1.to_be_bytes();
447 let reconstructed = OxiStrHash::from_be_bytes(bytes);
448 assert_eq!(hash1, reconstructed);
449 }
450
451 #[test]
452 fn test_edge_cases_empty_string() {
453 let mut interner = StringInterner::new();
454
455 let empty_hash = interner.intern("");
457 assert_eq!(interner.resolve(&empty_hash), Some(""));
458
459 let empty_hash2 = interner.intern("");
461 assert_eq!(empty_hash, empty_hash2);
462
463 let stats = interner.stats();
464 assert_eq!(stats.total_strings, 1);
465 assert_eq!(stats.deduplication_saves, 1);
466 }
467
468 #[test]
469 fn test_edge_cases_unicode_strings() {
470 let mut interner = StringInterner::new();
471
472 let unicode_test_cases = [
474 "Hello, 世界!",
475 "Ħello, мир!",
476 "🌍🚀✨",
477 "नमस्ते",
478 "مرحبا",
479 "\u{1F4A9}\u{200D}\u{1F4BB}", ];
481
482 for test_case in &unicode_test_cases {
483 let hash = interner.intern(test_case);
484 assert_eq!(interner.resolve(&hash), Some(*test_case));
485 }
486 }
487
488 #[test]
489 fn test_edge_cases_large_strings() {
490 let mut interner = StringInterner::new();
491
492 let large_string = "x".repeat(1_000_000); let hash = interner.intern(&large_string);
495 assert_eq!(interner.resolve(&hash), Some(large_string.as_str()));
496
497 let hash2 = interner.intern(&large_string);
499 assert_eq!(hash, hash2);
500
501 let stats = interner.stats();
502 assert_eq!(stats.deduplication_saves, 1);
503 }
504
505 #[test]
506 fn test_error_conditions_invalid_hashes() {
507 let interner = StringInterner::new();
508
509 let fake_hash = OxiStrHash::from_be_bytes([0xFF; 16]);
511 assert_eq!(interner.resolve(&fake_hash), None);
512 }
513
514 #[test]
515 fn test_error_conditions_decode_failures() -> Result<(), Box<dyn std::error::Error>> {
516 let encoder = OptimizedTermEncoder::new();
517
518 let fake_hash = OxiStrHash::from_be_bytes([0xFF; 16]);
520 let encoded = OxiEncodedTerm::NamedNode { iri: fake_hash };
521
522 assert!(encoder.decode_term(&encoded).is_err());
524
525 Ok(())
526 }
527
528 #[test]
529 fn test_numeric_literal_edge_cases() -> Result<(), Box<dyn std::error::Error>> {
530 let encoder = OptimizedTermEncoder::new();
531
532 let max_int = Literal::new_typed_literal(
534 i64::MAX.to_string(),
535 NamedNode::new("http://www.w3.org/2001/XMLSchema#integer")?,
536 );
537 let encoded = encoder.encode_literal(&max_int);
538 assert!(matches!(encoded, OxiEncodedTerm::IntegerLiteral(i64::MAX)));
539
540 let min_int = Literal::new_typed_literal(
541 i64::MIN.to_string(),
542 NamedNode::new("http://www.w3.org/2001/XMLSchema#integer")?,
543 );
544 let encoded = encoder.encode_literal(&min_int);
545 assert!(matches!(encoded, OxiEncodedTerm::IntegerLiteral(i64::MIN)));
546
547 let nan_float = Literal::new_typed_literal(
549 "NaN",
550 NamedNode::new("http://www.w3.org/2001/XMLSchema#float")?,
551 );
552 let encoded = encoder.encode_literal(&nan_float);
553 if let OxiEncodedTerm::FloatLiteral(val) = encoded {
554 assert!(val.is_nan());
555 } else {
556 panic!("Expected FloatLiteral");
557 }
558
559 let inf_float = Literal::new_typed_literal(
560 "INF",
561 NamedNode::new("http://www.w3.org/2001/XMLSchema#float")?,
562 );
563 let encoded = encoder.encode_literal(&inf_float);
564 if let OxiEncodedTerm::FloatLiteral(val) = encoded {
565 assert!(val.is_infinite() && val.is_sign_positive());
566 } else {
567 panic!("Expected FloatLiteral");
568 }
569
570 Ok(())
571 }
572
573 #[test]
574 fn test_invalid_numeric_literals() -> Result<(), Box<dyn std::error::Error>> {
575 let encoder = OptimizedTermEncoder::new();
576
577 let invalid_int = Literal::new_typed_literal(
579 "not_a_number",
580 NamedNode::new("http://www.w3.org/2001/XMLSchema#integer")?,
581 );
582 let encoded = encoder.encode_literal(&invalid_int);
583 assert!(matches!(encoded, OxiEncodedTerm::Literal { .. }));
584
585 let invalid_float = Literal::new_typed_literal(
587 "not_a_float",
588 NamedNode::new("http://www.w3.org/2001/XMLSchema#float")?,
589 );
590 let encoded = encoder.encode_literal(&invalid_float);
591 assert!(matches!(encoded, OxiEncodedTerm::Literal { .. }));
592
593 Ok(())
594 }
595
596 #[test]
597 fn test_memory_efficiency() {
598 let mut interner = StringInterner::new();
599
600 let test_string = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
602 let num_duplicates = 10000;
603
604 for _ in 0..num_duplicates {
605 interner.intern(test_string);
606 }
607
608 let stats = interner.stats();
609 assert_eq!(stats.total_strings, 1); assert_eq!(stats.deduplication_saves, num_duplicates - 1);
611
612 assert_eq!(stats.memory_usage, test_string.len());
614 }
615
616 #[test]
617 fn test_concurrent_safety_simulation() {
618 use std::sync::Arc;
619 use std::thread;
620
621 let encoder = Arc::new(OptimizedTermEncoder::new());
622 let test_strings = vec![
623 "http://example.org/test1",
624 "http://example.org/test2",
625 "http://example.org/test3",
626 ];
627
628 let handles: Vec<_> = test_strings
629 .into_iter()
630 .enumerate()
631 .map(|(i, s)| {
632 let encoder = Arc::clone(&encoder);
633 let s = s.to_string();
634 thread::spawn(move || {
635 let named_node = NamedNode::new(&s).expect("valid IRI");
637 let encoded = encoder.encode_named_node(&named_node);
638 (i, encoded)
639 })
640 })
641 .collect();
642
643 let results: Vec<_> = handles
645 .into_iter()
646 .map(|h| h.join().expect("thread should not panic"))
647 .collect();
648 assert_eq!(results.len(), 3);
649
650 for (_, encoded) in results {
652 assert!(matches!(encoded, OxiEncodedTerm::NamedNode { .. }));
653 }
654 }
655}