Skip to main content

oxirs_core/model/
optimized_terms.rs

1// Optimized RDF term representations based on Oxigraph's oxrdf optimizations
2// This module provides memory-efficient, hash-based term storage and encoding
3
4use crate::model::{BlankNode, Literal, NamedNode};
5use siphasher::sip128::{Hasher128, SipHasher24};
6use std::collections::HashMap;
7use std::hash::{Hash, Hasher};
8use std::sync::{Arc, RwLock};
9
10/// A 16-byte hash for efficient string deduplication (Oxigraph-inspired optimization)
11#[derive(Eq, PartialEq, Debug, Clone, Copy)]
12pub struct OxiStrHash {
13    hash: [u8; 16],
14}
15
16impl OxiStrHash {
17    pub fn new(value: &str) -> Self {
18        let mut hasher = SipHasher24::new();
19        hasher.write(value.as_bytes());
20        Self {
21            hash: u128::from(hasher.finish128()).to_be_bytes(),
22        }
23    }
24
25    #[inline]
26    pub fn from_be_bytes(hash: [u8; 16]) -> Self {
27        Self { hash }
28    }
29
30    #[inline]
31    pub fn to_be_bytes(self) -> [u8; 16] {
32        self.hash
33    }
34}
35
36impl Hash for OxiStrHash {
37    #[inline]
38    fn hash<H: Hasher>(&self, state: &mut H) {
39        state.write_u128(u128::from_ne_bytes(self.hash))
40    }
41}
42
43/// Compact encoded representation of RDF terms (Oxigraph-inspired optimization)
44#[derive(Debug, Clone, PartialEq)]
45pub enum OxiEncodedTerm {
46    DefaultGraph,
47    NamedNode {
48        iri: OxiStrHash,
49    },
50    BlankNode {
51        id: OxiStrHash,
52    },
53    Literal {
54        value: OxiStrHash,
55        datatype: Option<OxiStrHash>,
56        language: Option<String>,
57    },
58    // Optimized encodings for common literal types
59    BooleanLiteral(bool),
60    IntegerLiteral(i64),
61    FloatLiteral(f32),
62    DoubleLiteral(f64),
63    StringLiteral(OxiStrHash),
64}
65
66impl Hash for OxiEncodedTerm {
67    fn hash<H: Hasher>(&self, state: &mut H) {
68        match self {
69            OxiEncodedTerm::DefaultGraph => {
70                0u8.hash(state);
71            }
72            OxiEncodedTerm::NamedNode { iri } => {
73                1u8.hash(state);
74                iri.hash(state);
75            }
76            OxiEncodedTerm::BlankNode { id } => {
77                2u8.hash(state);
78                id.hash(state);
79            }
80            OxiEncodedTerm::Literal {
81                value,
82                datatype,
83                language,
84            } => {
85                3u8.hash(state);
86                value.hash(state);
87                datatype.hash(state);
88                language.hash(state);
89            }
90            OxiEncodedTerm::BooleanLiteral(value) => {
91                4u8.hash(state);
92                value.hash(state);
93            }
94            OxiEncodedTerm::IntegerLiteral(value) => {
95                5u8.hash(state);
96                value.hash(state);
97            }
98            OxiEncodedTerm::FloatLiteral(value) => {
99                6u8.hash(state);
100                // For floating point values, we use the bit representation for hashing
101                value.to_bits().hash(state);
102            }
103            OxiEncodedTerm::DoubleLiteral(value) => {
104                7u8.hash(state);
105                // For floating point values, we use the bit representation for hashing
106                value.to_bits().hash(state);
107            }
108            OxiEncodedTerm::StringLiteral(value) => {
109                8u8.hash(state);
110                value.hash(state);
111            }
112        }
113    }
114}
115
116impl Eq for OxiEncodedTerm {}
117
118/// High-performance string interner using hash-based deduplication
119#[derive(Debug, Default)]
120pub struct StringInterner {
121    /// Maps hashes to actual strings
122    string_storage: HashMap<OxiStrHash, String>,
123    /// Statistics
124    total_strings: usize,
125    total_deduplication_saves: usize,
126}
127
128impl StringInterner {
129    pub fn new() -> Self {
130        Self::default()
131    }
132
133    /// Intern a string and return its hash
134    pub fn intern(&mut self, value: &str) -> OxiStrHash {
135        let hash = OxiStrHash::new(value);
136
137        if let std::collections::hash_map::Entry::Vacant(e) = self.string_storage.entry(hash) {
138            e.insert(value.to_string());
139            self.total_strings += 1;
140        } else {
141            self.total_deduplication_saves += 1;
142        }
143
144        hash
145    }
146
147    /// Resolve a hash back to its string
148    pub fn resolve(&self, hash: &OxiStrHash) -> Option<&str> {
149        self.string_storage.get(hash).map(|s| s.as_str())
150    }
151
152    /// Get interning statistics
153    pub fn stats(&self) -> InternerStats {
154        InternerStats {
155            total_strings: self.total_strings,
156            deduplication_saves: self.total_deduplication_saves,
157            memory_usage: self.string_storage.values().map(|s| s.len()).sum(),
158        }
159    }
160}
161
162#[derive(Debug, Clone)]
163pub struct InternerStats {
164    pub total_strings: usize,
165    pub deduplication_saves: usize,
166    pub memory_usage: usize,
167}
168
169/// Thread-safe term encoder with optimized storage
170pub struct OptimizedTermEncoder {
171    interner: Arc<RwLock<StringInterner>>,
172}
173
174impl OptimizedTermEncoder {
175    pub fn new() -> Self {
176        Self {
177            interner: Arc::new(RwLock::new(StringInterner::new())),
178        }
179    }
180
181    /// Encode a named node efficiently
182    pub fn encode_named_node(&self, node: &NamedNode) -> OxiEncodedTerm {
183        let mut interner = self
184            .interner
185            .write()
186            .expect("interner lock should not be poisoned");
187        let iri_hash = interner.intern(node.as_str());
188        OxiEncodedTerm::NamedNode { iri: iri_hash }
189    }
190
191    /// Encode a blank node efficiently
192    pub fn encode_blank_node(&self, node: &BlankNode) -> OxiEncodedTerm {
193        let mut interner = self
194            .interner
195            .write()
196            .expect("interner lock should not be poisoned");
197        let id_hash = interner.intern(node.as_str());
198        OxiEncodedTerm::BlankNode { id: id_hash }
199    }
200
201    /// Encode a literal with type-specific optimizations
202    pub fn encode_literal(&self, literal: &Literal) -> OxiEncodedTerm {
203        let literal_str = literal.value();
204
205        // Try type-specific optimizations first
206        let datatype = literal.datatype();
207        match datatype.as_str() {
208            "http://www.w3.org/2001/XMLSchema#boolean" => {
209                if let Ok(value) = literal_str.parse::<bool>() {
210                    return OxiEncodedTerm::BooleanLiteral(value);
211                }
212            }
213            "http://www.w3.org/2001/XMLSchema#integer"
214            | "http://www.w3.org/2001/XMLSchema#int"
215            | "http://www.w3.org/2001/XMLSchema#long" => {
216                if let Ok(value) = literal_str.parse::<i64>() {
217                    return OxiEncodedTerm::IntegerLiteral(value);
218                }
219            }
220            "http://www.w3.org/2001/XMLSchema#float" => {
221                if let Ok(value) = literal_str.parse::<f32>() {
222                    return OxiEncodedTerm::FloatLiteral(value);
223                }
224            }
225            "http://www.w3.org/2001/XMLSchema#double" => {
226                if let Ok(value) = literal_str.parse::<f64>() {
227                    return OxiEncodedTerm::DoubleLiteral(value);
228                }
229            }
230            "http://www.w3.org/2001/XMLSchema#string" => {
231                let mut interner = self
232                    .interner
233                    .write()
234                    .expect("interner lock should not be poisoned");
235                let value_hash = interner.intern(literal_str);
236                return OxiEncodedTerm::StringLiteral(value_hash);
237            }
238            _ => {
239                // Fall through to general encoding
240            }
241        }
242
243        // General literal encoding
244        let mut interner = self
245            .interner
246            .write()
247            .expect("interner lock should not be poisoned");
248        let value_hash = interner.intern(literal_str);
249
250        let datatype_hash = Some(interner.intern(datatype.as_str()));
251        let language = literal.language().map(|lang| lang.to_string());
252
253        OxiEncodedTerm::Literal {
254            value: value_hash,
255            datatype: datatype_hash,
256            language,
257        }
258    }
259
260    /// Decode an encoded term back to its original form
261    pub fn decode_term(&self, encoded: &OxiEncodedTerm) -> Result<DecodedTerm, String> {
262        let interner = self
263            .interner
264            .read()
265            .expect("interner lock should not be poisoned");
266
267        match encoded {
268            OxiEncodedTerm::DefaultGraph => Ok(DecodedTerm::DefaultGraph),
269
270            OxiEncodedTerm::NamedNode { iri } => {
271                let iri_str = interner
272                    .resolve(iri)
273                    .ok_or("IRI hash not found in interner")?;
274                Ok(DecodedTerm::NamedNode(iri_str.to_string()))
275            }
276
277            OxiEncodedTerm::BlankNode { id } => {
278                let id_str = interner
279                    .resolve(id)
280                    .ok_or("Blank node ID hash not found in interner")?;
281                Ok(DecodedTerm::BlankNode(id_str.to_string()))
282            }
283
284            OxiEncodedTerm::BooleanLiteral(value) => Ok(DecodedTerm::Literal {
285                value: value.to_string(),
286                datatype: Some("http://www.w3.org/2001/XMLSchema#boolean".to_string()),
287                language: None,
288            }),
289
290            OxiEncodedTerm::IntegerLiteral(value) => Ok(DecodedTerm::Literal {
291                value: value.to_string(),
292                datatype: Some("http://www.w3.org/2001/XMLSchema#integer".to_string()),
293                language: None,
294            }),
295
296            OxiEncodedTerm::FloatLiteral(value) => Ok(DecodedTerm::Literal {
297                value: value.to_string(),
298                datatype: Some("http://www.w3.org/2001/XMLSchema#float".to_string()),
299                language: None,
300            }),
301
302            OxiEncodedTerm::DoubleLiteral(value) => Ok(DecodedTerm::Literal {
303                value: value.to_string(),
304                datatype: Some("http://www.w3.org/2001/XMLSchema#double".to_string()),
305                language: None,
306            }),
307
308            OxiEncodedTerm::StringLiteral(value_hash) => {
309                let value_str = interner
310                    .resolve(value_hash)
311                    .ok_or("String literal hash not found in interner")?;
312                Ok(DecodedTerm::Literal {
313                    value: value_str.to_string(),
314                    datatype: Some("http://www.w3.org/2001/XMLSchema#string".to_string()),
315                    language: None,
316                })
317            }
318
319            OxiEncodedTerm::Literal {
320                value,
321                datatype,
322                language,
323            } => {
324                let value_str = interner
325                    .resolve(value)
326                    .ok_or("Literal value hash not found in interner")?;
327
328                let datatype_str = if let Some(dt_hash) = datatype {
329                    Some(
330                        interner
331                            .resolve(dt_hash)
332                            .ok_or("Datatype hash not found in interner")?
333                            .to_string(),
334                    )
335                } else {
336                    None
337                };
338
339                Ok(DecodedTerm::Literal {
340                    value: value_str.to_string(),
341                    datatype: datatype_str,
342                    language: language.clone(),
343                })
344            }
345        }
346    }
347
348    /// Get interner statistics
349    pub fn stats(&self) -> InternerStats {
350        self.interner
351            .read()
352            .expect("interner lock should not be poisoned")
353            .stats()
354    }
355}
356
357impl Default for OptimizedTermEncoder {
358    fn default() -> Self {
359        Self::new()
360    }
361}
362
363/// Decoded term representation for reconstruction
364#[derive(Debug, Clone, PartialEq)]
365pub enum DecodedTerm {
366    DefaultGraph,
367    NamedNode(String),
368    BlankNode(String),
369    Literal {
370        value: String,
371        datatype: Option<String>,
372        language: Option<String>,
373    },
374}
375
376#[cfg(test)]
377mod tests {
378    use super::*;
379    use crate::model::{Literal, NamedNode};
380
381    #[test]
382    fn test_string_interner() {
383        let mut interner = StringInterner::new();
384
385        let hash1 = interner.intern("http://example.org/test");
386        let hash2 = interner.intern("http://example.org/test"); // Same string
387        let hash3 = interner.intern("http://example.org/other");
388
389        assert_eq!(hash1, hash2); // Same hash for same string
390        assert_ne!(hash1, hash3); // Different hash for different string
391
392        assert_eq!(interner.resolve(&hash1), Some("http://example.org/test"));
393        assert_eq!(interner.resolve(&hash3), Some("http://example.org/other"));
394
395        let stats = interner.stats();
396        assert_eq!(stats.total_strings, 2); // Only 2 unique strings
397        assert_eq!(stats.deduplication_saves, 1); // 1 deduplication
398    }
399
400    #[test]
401    fn test_optimized_encoding() -> Result<(), Box<dyn std::error::Error>> {
402        let encoder = OptimizedTermEncoder::new();
403
404        // Test named node encoding
405        let named_node = NamedNode::new("http://example.org/test")?;
406        let encoded = encoder.encode_named_node(&named_node);
407
408        match encoder.decode_term(&encoded)? {
409            DecodedTerm::NamedNode(iri) => {
410                assert_eq!(iri, "http://example.org/test");
411            }
412            _ => panic!("Expected named node"),
413        }
414
415        // Test optimized integer literal
416        let int_literal = Literal::new_typed_literal(
417            "42",
418            NamedNode::new("http://www.w3.org/2001/XMLSchema#integer")?,
419        );
420        let encoded = encoder.encode_literal(&int_literal);
421
422        assert!(matches!(encoded, OxiEncodedTerm::IntegerLiteral(42)));
423
424        // Test optimized boolean literal
425        let bool_literal = Literal::new_typed_literal(
426            "true",
427            NamedNode::new("http://www.w3.org/2001/XMLSchema#boolean")?,
428        );
429        let encoded = encoder.encode_literal(&bool_literal);
430
431        assert!(matches!(encoded, OxiEncodedTerm::BooleanLiteral(true)));
432
433        Ok(())
434    }
435
436    #[test]
437    fn test_hash_consistency() {
438        let hash1 = OxiStrHash::new("test string");
439        let hash2 = OxiStrHash::new("test string");
440        let hash3 = OxiStrHash::new("different string");
441
442        assert_eq!(hash1, hash2);
443        assert_ne!(hash1, hash3);
444
445        // Test byte conversion
446        let bytes = hash1.to_be_bytes();
447        let reconstructed = OxiStrHash::from_be_bytes(bytes);
448        assert_eq!(hash1, reconstructed);
449    }
450
451    #[test]
452    fn test_edge_cases_empty_string() {
453        let mut interner = StringInterner::new();
454
455        // Test empty string handling
456        let empty_hash = interner.intern("");
457        assert_eq!(interner.resolve(&empty_hash), Some(""));
458
459        // Test multiple empty strings (should deduplicate)
460        let empty_hash2 = interner.intern("");
461        assert_eq!(empty_hash, empty_hash2);
462
463        let stats = interner.stats();
464        assert_eq!(stats.total_strings, 1);
465        assert_eq!(stats.deduplication_saves, 1);
466    }
467
468    #[test]
469    fn test_edge_cases_unicode_strings() {
470        let mut interner = StringInterner::new();
471
472        // Test Unicode strings
473        let unicode_test_cases = [
474            "Hello, 世界!",
475            "Ħello, мир!",
476            "🌍🚀✨",
477            "नमस्ते",
478            "مرحبا",
479            "\u{1F4A9}\u{200D}\u{1F4BB}", // Complex emoji sequence
480        ];
481
482        for test_case in &unicode_test_cases {
483            let hash = interner.intern(test_case);
484            assert_eq!(interner.resolve(&hash), Some(*test_case));
485        }
486    }
487
488    #[test]
489    fn test_edge_cases_large_strings() {
490        let mut interner = StringInterner::new();
491
492        // Test very large strings
493        let large_string = "x".repeat(1_000_000); // 1MB string
494        let hash = interner.intern(&large_string);
495        assert_eq!(interner.resolve(&hash), Some(large_string.as_str()));
496
497        // Test deduplication of large strings
498        let hash2 = interner.intern(&large_string);
499        assert_eq!(hash, hash2);
500
501        let stats = interner.stats();
502        assert_eq!(stats.deduplication_saves, 1);
503    }
504
505    #[test]
506    fn test_error_conditions_invalid_hashes() {
507        let interner = StringInterner::new();
508
509        // Test resolving non-existent hash
510        let fake_hash = OxiStrHash::from_be_bytes([0xFF; 16]);
511        assert_eq!(interner.resolve(&fake_hash), None);
512    }
513
514    #[test]
515    fn test_error_conditions_decode_failures() -> Result<(), Box<dyn std::error::Error>> {
516        let encoder = OptimizedTermEncoder::new();
517
518        // Create an encoded term with a hash that doesn't exist in the interner
519        let fake_hash = OxiStrHash::from_be_bytes([0xFF; 16]);
520        let encoded = OxiEncodedTerm::NamedNode { iri: fake_hash };
521
522        // This should fail when trying to decode
523        assert!(encoder.decode_term(&encoded).is_err());
524
525        Ok(())
526    }
527
528    #[test]
529    fn test_numeric_literal_edge_cases() -> Result<(), Box<dyn std::error::Error>> {
530        let encoder = OptimizedTermEncoder::new();
531
532        // Test integer boundary values
533        let max_int = Literal::new_typed_literal(
534            i64::MAX.to_string(),
535            NamedNode::new("http://www.w3.org/2001/XMLSchema#integer")?,
536        );
537        let encoded = encoder.encode_literal(&max_int);
538        assert!(matches!(encoded, OxiEncodedTerm::IntegerLiteral(i64::MAX)));
539
540        let min_int = Literal::new_typed_literal(
541            i64::MIN.to_string(),
542            NamedNode::new("http://www.w3.org/2001/XMLSchema#integer")?,
543        );
544        let encoded = encoder.encode_literal(&min_int);
545        assert!(matches!(encoded, OxiEncodedTerm::IntegerLiteral(i64::MIN)));
546
547        // Test float special values
548        let nan_float = Literal::new_typed_literal(
549            "NaN",
550            NamedNode::new("http://www.w3.org/2001/XMLSchema#float")?,
551        );
552        let encoded = encoder.encode_literal(&nan_float);
553        if let OxiEncodedTerm::FloatLiteral(val) = encoded {
554            assert!(val.is_nan());
555        } else {
556            panic!("Expected FloatLiteral");
557        }
558
559        let inf_float = Literal::new_typed_literal(
560            "INF",
561            NamedNode::new("http://www.w3.org/2001/XMLSchema#float")?,
562        );
563        let encoded = encoder.encode_literal(&inf_float);
564        if let OxiEncodedTerm::FloatLiteral(val) = encoded {
565            assert!(val.is_infinite() && val.is_sign_positive());
566        } else {
567            panic!("Expected FloatLiteral");
568        }
569
570        Ok(())
571    }
572
573    #[test]
574    fn test_invalid_numeric_literals() -> Result<(), Box<dyn std::error::Error>> {
575        let encoder = OptimizedTermEncoder::new();
576
577        // Test invalid integer (should fall back to general literal encoding)
578        let invalid_int = Literal::new_typed_literal(
579            "not_a_number",
580            NamedNode::new("http://www.w3.org/2001/XMLSchema#integer")?,
581        );
582        let encoded = encoder.encode_literal(&invalid_int);
583        assert!(matches!(encoded, OxiEncodedTerm::Literal { .. }));
584
585        // Test invalid float (should fall back to general literal encoding)
586        let invalid_float = Literal::new_typed_literal(
587            "not_a_float",
588            NamedNode::new("http://www.w3.org/2001/XMLSchema#float")?,
589        );
590        let encoded = encoder.encode_literal(&invalid_float);
591        assert!(matches!(encoded, OxiEncodedTerm::Literal { .. }));
592
593        Ok(())
594    }
595
596    #[test]
597    fn test_memory_efficiency() {
598        let mut interner = StringInterner::new();
599
600        // Intern many duplicate strings to test memory efficiency
601        let test_string = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
602        let num_duplicates = 10000;
603
604        for _ in 0..num_duplicates {
605            interner.intern(test_string);
606        }
607
608        let stats = interner.stats();
609        assert_eq!(stats.total_strings, 1); // Only one unique string
610        assert_eq!(stats.deduplication_saves, num_duplicates - 1);
611
612        // Memory usage should be just the size of one string
613        assert_eq!(stats.memory_usage, test_string.len());
614    }
615
616    #[test]
617    fn test_concurrent_safety_simulation() {
618        use std::sync::Arc;
619        use std::thread;
620
621        let encoder = Arc::new(OptimizedTermEncoder::new());
622        let test_strings = vec![
623            "http://example.org/test1",
624            "http://example.org/test2",
625            "http://example.org/test3",
626        ];
627
628        let handles: Vec<_> = test_strings
629            .into_iter()
630            .enumerate()
631            .map(|(i, s)| {
632                let encoder = Arc::clone(&encoder);
633                let s = s.to_string();
634                thread::spawn(move || {
635                    // Simulate concurrent access
636                    let named_node = NamedNode::new(&s).expect("valid IRI");
637                    let encoded = encoder.encode_named_node(&named_node);
638                    (i, encoded)
639                })
640            })
641            .collect();
642
643        // Wait for all threads and collect results
644        let results: Vec<_> = handles
645            .into_iter()
646            .map(|h| h.join().expect("thread should not panic"))
647            .collect();
648        assert_eq!(results.len(), 3);
649
650        // Verify all encodings are valid
651        for (_, encoded) in results {
652            assert!(matches!(encoded, OxiEncodedTerm::NamedNode { .. }));
653        }
654    }
655}