Skip to main content

oxirs_core/store/
encoding.rs

1//! Efficient encoding for RDF terms
2//!
3//! This implementation is extracted and adapted from Oxigraph's numeric_encoder.rs
4//! to provide zero-dependency term encoding with hash-based optimization.
5
6use crate::model::{
7    BlankNode, BlankNodeRef, Literal, LiteralRef, NamedNode, NamedNodeRef, Term, TermRef,
8};
9use serde::{Deserialize, Serialize};
10use std::collections::hash_map::DefaultHasher;
11use std::fmt::{Debug, Display};
12use std::hash::{Hash, Hasher};
13
14/// A hash of a string for efficient storage and comparison
15#[derive(Eq, PartialEq, Debug, Clone, Copy, Serialize, Deserialize)]
16pub struct StrHash {
17    hash: [u8; 16],
18}
19
20impl StrHash {
21    /// Creates a new string hash using a fast hashing algorithm
22    pub fn new(value: &str) -> Self {
23        let mut hasher = DefaultHasher::new();
24        hasher.write(value.as_bytes());
25        let hash_value = hasher.finish();
26
27        // Create a 16-byte hash by using the 8-byte hash twice with different salts
28        let mut full_hash = [0u8; 16];
29        full_hash[0..8].copy_from_slice(&hash_value.to_be_bytes());
30
31        // Create second hash with salt for better distribution
32        let mut hasher2 = DefaultHasher::new();
33        hasher2.write(&[0xDE, 0xAD, 0xBE, 0xEF]); // Salt
34        hasher2.write(value.as_bytes());
35        let hash_value2 = hasher2.finish();
36        full_hash[8..16].copy_from_slice(&hash_value2.to_be_bytes());
37
38        Self { hash: full_hash }
39    }
40
41    /// Creates a StrHash from raw bytes
42    #[inline]
43    pub fn from_be_bytes(hash: [u8; 16]) -> Self {
44        Self { hash }
45    }
46
47    /// Returns the hash as raw bytes
48    #[inline]
49    pub fn to_be_bytes(self) -> [u8; 16] {
50        self.hash
51    }
52}
53
54impl Hash for StrHash {
55    #[inline]
56    fn hash<H: Hasher>(&self, state: &mut H) {
57        // Use the first 8 bytes as the hash value
58        let hash_val = u64::from_be_bytes([
59            self.hash[0],
60            self.hash[1],
61            self.hash[2],
62            self.hash[3],
63            self.hash[4],
64            self.hash[5],
65            self.hash[6],
66            self.hash[7],
67        ]);
68        state.write_u64(hash_val);
69    }
70}
71
72impl Display for StrHash {
73    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74        write!(f, "StrHash({})", hex::encode(self.hash))
75    }
76}
77
78/// Small string optimization for commonly used strings
79/// Stores strings up to 15 bytes inline without allocation
80#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
81pub struct SmallString {
82    data: [u8; 16],
83    len: u8,
84}
85
86impl SmallString {
87    const MAX_INLINE_LEN: usize = 15;
88
89    /// Creates a new SmallString from a string slice
90    pub fn new(s: &str) -> Option<Self> {
91        if s.len() > Self::MAX_INLINE_LEN {
92            return None;
93        }
94
95        let mut data = [0u8; 16];
96        data[..s.len()].copy_from_slice(s.as_bytes());
97
98        Some(SmallString {
99            data,
100            len: s.len() as u8,
101        })
102    }
103
104    /// Returns the string slice
105    pub fn as_str(&self) -> &str {
106        unsafe { std::str::from_utf8_unchecked(&self.data[..self.len as usize]) }
107    }
108
109    /// Returns the length of the string
110    pub fn len(&self) -> usize {
111        self.len as usize
112    }
113
114    /// Returns true if the string is empty
115    pub fn is_empty(&self) -> bool {
116        self.len == 0
117    }
118}
119
120impl Display for SmallString {
121    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
122        write!(f, "{}", self.as_str())
123    }
124}
125
126impl From<&str> for SmallString {
127    fn from(s: &str) -> Self {
128        Self::new(s).expect("String too long for SmallString")
129    }
130}
131
132/// Encoded term representation for efficient storage
133#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
134pub enum EncodedTerm {
135    /// Default graph (for quad-based stores)
136    DefaultGraph,
137
138    /// Named node with hashed IRI
139    NamedNode { iri_id: StrHash },
140
141    /// Blank node with numerical ID (16 bytes for uniqueness)
142    NumericalBlankNode { id: [u8; 16] },
143
144    /// Small blank node with inline string
145    SmallBlankNode(SmallString),
146
147    /// Large blank node with hashed ID
148    BigBlankNode { id_id: StrHash },
149
150    /// Small string literal (inline)
151    SmallStringLiteral(SmallString),
152
153    /// Large string literal (hashed)
154    BigStringLiteral { value_id: StrHash },
155
156    /// Small value, small language tag
157    SmallSmallLangStringLiteral {
158        value: SmallString,
159        language: SmallString,
160    },
161
162    /// Small value, large language tag
163    SmallBigLangStringLiteral {
164        value: SmallString,
165        language_id: StrHash,
166    },
167
168    /// Large value, small language tag
169    BigSmallLangStringLiteral {
170        value_id: StrHash,
171        language: SmallString,
172    },
173
174    /// Large value, large language tag
175    BigBigLangStringLiteral {
176        value_id: StrHash,
177        language_id: StrHash,
178    },
179
180    /// Typed literal with small value and small datatype
181    SmallSmallTypedLiteral {
182        value: SmallString,
183        datatype: SmallString,
184    },
185
186    /// Typed literal with small value and hashed datatype
187    SmallBigTypedLiteral {
188        value: SmallString,
189        datatype_id: StrHash,
190    },
191
192    /// Typed literal with hashed value and small datatype
193    BigSmallTypedLiteral {
194        value_id: StrHash,
195        datatype: SmallString,
196    },
197
198    /// Typed literal with hashed value and hashed datatype
199    BigBigTypedLiteral {
200        value_id: StrHash,
201        datatype_id: StrHash,
202    },
203
204    /// Quoted triple (RDF-star)
205    QuotedTriple {
206        subject: Box<EncodedTerm>,
207        predicate: Box<EncodedTerm>,
208        object: Box<EncodedTerm>,
209    },
210}
211
212impl EncodedTerm {
213    /// Encodes a named node
214    pub fn encode_named_node(node: &NamedNode) -> Self {
215        EncodedTerm::NamedNode {
216            iri_id: StrHash::new(node.as_str()),
217        }
218    }
219
220    /// Encodes a named node reference
221    pub fn encode_named_node_ref(node: NamedNodeRef<'_>) -> Self {
222        EncodedTerm::NamedNode {
223            iri_id: StrHash::new(node.as_str()),
224        }
225    }
226
227    /// Encodes a blank node
228    pub fn encode_blank_node(node: &BlankNode) -> Self {
229        let id_str = node.as_str();
230
231        // Try to use numerical representation if it's a hex ID
232        if let Ok(bytes) = hex::decode(id_str) {
233            if bytes.len() == 16 {
234                let mut id = [0u8; 16];
235                id.copy_from_slice(&bytes);
236                return EncodedTerm::NumericalBlankNode { id };
237            }
238        }
239
240        // Use string representation
241        if let Some(small_string) = SmallString::new(id_str) {
242            EncodedTerm::SmallBlankNode(small_string)
243        } else {
244            EncodedTerm::BigBlankNode {
245                id_id: StrHash::new(id_str),
246            }
247        }
248    }
249
250    /// Encodes a blank node reference
251    pub fn encode_blank_node_ref(node: BlankNodeRef<'_>) -> Self {
252        let id_str = node.as_str();
253
254        // Try to use numerical representation if it's a hex ID
255        if let Ok(bytes) = hex::decode(id_str) {
256            if bytes.len() == 16 {
257                let mut id = [0u8; 16];
258                id.copy_from_slice(&bytes);
259                return EncodedTerm::NumericalBlankNode { id };
260            }
261        }
262
263        // Use string representation
264        if let Some(small_string) = SmallString::new(id_str) {
265            EncodedTerm::SmallBlankNode(small_string)
266        } else {
267            EncodedTerm::BigBlankNode {
268                id_id: StrHash::new(id_str),
269            }
270        }
271    }
272
273    /// Encodes a literal
274    pub fn encode_literal(literal: &Literal) -> Self {
275        let value = literal.value();
276
277        if let Some(language) = literal.language() {
278            // Language-tagged string literal
279            match (SmallString::new(value), SmallString::new(language)) {
280                (Some(small_value), Some(small_lang)) => EncodedTerm::SmallSmallLangStringLiteral {
281                    value: small_value,
282                    language: small_lang,
283                },
284                (Some(small_value), None) => EncodedTerm::SmallBigLangStringLiteral {
285                    value: small_value,
286                    language_id: StrHash::new(language),
287                },
288                (None, Some(small_lang)) => EncodedTerm::BigSmallLangStringLiteral {
289                    value_id: StrHash::new(value),
290                    language: small_lang,
291                },
292                (None, None) => EncodedTerm::BigBigLangStringLiteral {
293                    value_id: StrHash::new(value),
294                    language_id: StrHash::new(language),
295                },
296            }
297        } else {
298            let datatype = literal.datatype();
299            let datatype_str = datatype.as_str();
300
301            // Check if it's a simple string literal (xsd:string)
302            if datatype_str == "http://www.w3.org/2001/XMLSchema#string" {
303                if let Some(small_value) = SmallString::new(value) {
304                    EncodedTerm::SmallStringLiteral(small_value)
305                } else {
306                    EncodedTerm::BigStringLiteral {
307                        value_id: StrHash::new(value),
308                    }
309                }
310            } else {
311                // Typed literal
312                match (SmallString::new(value), SmallString::new(datatype_str)) {
313                    (Some(small_value), Some(small_datatype)) => {
314                        EncodedTerm::SmallSmallTypedLiteral {
315                            value: small_value,
316                            datatype: small_datatype,
317                        }
318                    }
319                    (Some(small_value), None) => EncodedTerm::SmallBigTypedLiteral {
320                        value: small_value,
321                        datatype_id: StrHash::new(datatype_str),
322                    },
323                    (None, Some(small_datatype)) => EncodedTerm::BigSmallTypedLiteral {
324                        value_id: StrHash::new(value),
325                        datatype: small_datatype,
326                    },
327                    (None, None) => EncodedTerm::BigBigTypedLiteral {
328                        value_id: StrHash::new(value),
329                        datatype_id: StrHash::new(datatype_str),
330                    },
331                }
332            }
333        }
334    }
335
336    /// Encodes a literal reference
337    pub fn encode_literal_ref(literal: LiteralRef<'_>) -> Self {
338        let value = literal.value();
339
340        if let Some(language) = literal.language() {
341            // Language-tagged string literal
342            match (SmallString::new(value), SmallString::new(language)) {
343                (Some(small_value), Some(small_lang)) => EncodedTerm::SmallSmallLangStringLiteral {
344                    value: small_value,
345                    language: small_lang,
346                },
347                (Some(small_value), None) => EncodedTerm::SmallBigLangStringLiteral {
348                    value: small_value,
349                    language_id: StrHash::new(language),
350                },
351                (None, Some(small_lang)) => EncodedTerm::BigSmallLangStringLiteral {
352                    value_id: StrHash::new(value),
353                    language: small_lang,
354                },
355                (None, None) => EncodedTerm::BigBigLangStringLiteral {
356                    value_id: StrHash::new(value),
357                    language_id: StrHash::new(language),
358                },
359            }
360        } else {
361            let datatype = literal.datatype();
362            let datatype_str = datatype.as_str();
363
364            // Check if it's a simple string literal (xsd:string)
365            if datatype_str == "http://www.w3.org/2001/XMLSchema#string" {
366                if let Some(small_value) = SmallString::new(value) {
367                    EncodedTerm::SmallStringLiteral(small_value)
368                } else {
369                    EncodedTerm::BigStringLiteral {
370                        value_id: StrHash::new(value),
371                    }
372                }
373            } else {
374                // Typed literal
375                match (SmallString::new(value), SmallString::new(datatype_str)) {
376                    (Some(small_value), Some(small_datatype)) => {
377                        EncodedTerm::SmallSmallTypedLiteral {
378                            value: small_value,
379                            datatype: small_datatype,
380                        }
381                    }
382                    (Some(small_value), None) => EncodedTerm::SmallBigTypedLiteral {
383                        value: small_value,
384                        datatype_id: StrHash::new(datatype_str),
385                    },
386                    (None, Some(small_datatype)) => EncodedTerm::BigSmallTypedLiteral {
387                        value_id: StrHash::new(value),
388                        datatype: small_datatype,
389                    },
390                    (None, None) => EncodedTerm::BigBigTypedLiteral {
391                        value_id: StrHash::new(value),
392                        datatype_id: StrHash::new(datatype_str),
393                    },
394                }
395            }
396        }
397    }
398
399    /// Encodes a variable (currently not supported in storage context)
400    pub fn encode_variable(_variable: &crate::model::Variable) -> Self {
401        panic!("Variables cannot be encoded for storage - they are only used in queries")
402    }
403
404    /// Encodes a quoted triple (RDF-star)
405    pub fn encode_quoted_triple(quoted_triple: &crate::model::star::QuotedTriple) -> Self {
406        let inner = quoted_triple.inner();
407        EncodedTerm::QuotedTriple {
408            subject: Box::new(Self::encode_term(&Term::from(inner.subject().clone()))),
409            predicate: Box::new(Self::encode_term(&Term::from(inner.predicate().clone()))),
410            object: Box::new(Self::encode_term(&Term::from(inner.object().clone()))),
411        }
412    }
413
414    /// Encodes any term
415    pub fn encode_term(term: &Term) -> Self {
416        match term {
417            Term::NamedNode(n) => Self::encode_named_node(n),
418            Term::BlankNode(b) => Self::encode_blank_node(b),
419            Term::Literal(l) => Self::encode_literal(l),
420            Term::Variable(_) => panic!("Cannot encode variable in this context"),
421            Term::QuotedTriple(qt) => Self::encode_quoted_triple(qt),
422        }
423    }
424
425    /// Encodes any term reference
426    pub fn encode_term_ref(term: TermRef<'_>) -> Self {
427        match term {
428            TermRef::NamedNode(n) => Self::encode_named_node_ref(n),
429            TermRef::BlankNode(b) => Self::encode_blank_node_ref(b),
430            TermRef::Literal(l) => Self::encode_literal_ref(l),
431            TermRef::Variable(v) => Self::encode_variable(v),
432            #[cfg(feature = "rdf-star")]
433            TermRef::Triple(qt) => Self::encode_quoted_triple(qt),
434        }
435    }
436
437    /// Returns the type discriminant for sorting and indexing
438    pub fn type_discriminant(&self) -> u8 {
439        match self {
440            EncodedTerm::DefaultGraph => 0,
441            EncodedTerm::NamedNode { .. } => 1,
442            EncodedTerm::NumericalBlankNode { .. } => 2,
443            EncodedTerm::SmallBlankNode(_) => 3,
444            EncodedTerm::BigBlankNode { .. } => 4,
445            EncodedTerm::SmallStringLiteral(_) => 5,
446            EncodedTerm::BigStringLiteral { .. } => 6,
447            EncodedTerm::SmallSmallLangStringLiteral { .. } => 7,
448            EncodedTerm::SmallBigLangStringLiteral { .. } => 8,
449            EncodedTerm::BigSmallLangStringLiteral { .. } => 9,
450            EncodedTerm::BigBigLangStringLiteral { .. } => 10,
451            EncodedTerm::SmallSmallTypedLiteral { .. } => 11,
452            EncodedTerm::SmallBigTypedLiteral { .. } => 12,
453            EncodedTerm::BigSmallTypedLiteral { .. } => 13,
454            EncodedTerm::BigBigTypedLiteral { .. } => 14,
455            EncodedTerm::QuotedTriple { .. } => 15,
456        }
457    }
458
459    /// Returns true if this is a named node
460    pub fn is_named_node(&self) -> bool {
461        matches!(self, EncodedTerm::NamedNode { .. })
462    }
463
464    /// Returns true if this is a blank node
465    pub fn is_blank_node(&self) -> bool {
466        matches!(
467            self,
468            EncodedTerm::NumericalBlankNode { .. }
469                | EncodedTerm::SmallBlankNode(_)
470                | EncodedTerm::BigBlankNode { .. }
471        )
472    }
473
474    /// Returns true if this is a literal
475    pub fn is_literal(&self) -> bool {
476        matches!(
477            self,
478            EncodedTerm::SmallStringLiteral(_)
479                | EncodedTerm::BigStringLiteral { .. }
480                | EncodedTerm::SmallSmallLangStringLiteral { .. }
481                | EncodedTerm::SmallBigLangStringLiteral { .. }
482                | EncodedTerm::BigSmallLangStringLiteral { .. }
483                | EncodedTerm::BigBigLangStringLiteral { .. }
484                | EncodedTerm::SmallSmallTypedLiteral { .. }
485                | EncodedTerm::SmallBigTypedLiteral { .. }
486                | EncodedTerm::BigSmallTypedLiteral { .. }
487                | EncodedTerm::BigBigTypedLiteral { .. }
488        )
489    }
490
491    /// Returns true if this is a quoted triple (RDF-star)
492    pub fn is_quoted_triple(&self) -> bool {
493        matches!(self, EncodedTerm::QuotedTriple { .. })
494    }
495
496    /// Returns the size in bytes of this encoded term
497    pub fn size_hint(&self) -> usize {
498        match self {
499            EncodedTerm::DefaultGraph => 1,
500            EncodedTerm::NamedNode { .. } => 1 + 16,
501            EncodedTerm::NumericalBlankNode { .. } => 1 + 16,
502            EncodedTerm::SmallBlankNode(_) => 1 + 16 + 1,
503            EncodedTerm::BigBlankNode { .. } => 1 + 16,
504            EncodedTerm::SmallStringLiteral(_) => 1 + 16 + 1,
505            EncodedTerm::BigStringLiteral { .. } => 1 + 16,
506            EncodedTerm::SmallSmallLangStringLiteral { .. } => 1 + 16 + 1 + 16 + 1,
507            EncodedTerm::SmallBigLangStringLiteral { .. } => 1 + 16 + 1 + 16,
508            EncodedTerm::BigSmallLangStringLiteral { .. } => 1 + 16 + 16 + 1,
509            EncodedTerm::BigBigLangStringLiteral { .. } => 1 + 16 + 16,
510            EncodedTerm::SmallSmallTypedLiteral { .. } => 1 + 16 + 1 + 16 + 1,
511            EncodedTerm::SmallBigTypedLiteral { .. } => 1 + 16 + 1 + 16,
512            EncodedTerm::BigSmallTypedLiteral { .. } => 1 + 16 + 16 + 1,
513            EncodedTerm::BigBigTypedLiteral { .. } => 1 + 16 + 16,
514            EncodedTerm::QuotedTriple {
515                subject,
516                predicate,
517                object,
518            } => 1 + subject.size_hint() + predicate.size_hint() + object.size_hint(),
519        }
520    }
521}
522
523/// Encoded triple for efficient storage and indexing
524#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
525pub struct EncodedTriple {
526    pub subject: EncodedTerm,
527    pub predicate: EncodedTerm,
528    pub object: EncodedTerm,
529}
530
531impl EncodedTriple {
532    /// Creates a new encoded triple
533    pub fn new(subject: EncodedTerm, predicate: EncodedTerm, object: EncodedTerm) -> Self {
534        Self {
535            subject,
536            predicate,
537            object,
538        }
539    }
540
541    /// Returns the size hint for this triple
542    pub fn size_hint(&self) -> usize {
543        self.subject.size_hint() + self.predicate.size_hint() + self.object.size_hint()
544    }
545}
546
547/// Encoded quad for efficient storage and indexing
548#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
549pub struct EncodedQuad {
550    pub subject: EncodedTerm,
551    pub predicate: EncodedTerm,
552    pub object: EncodedTerm,
553    pub graph_name: EncodedTerm,
554}
555
556impl EncodedQuad {
557    /// Creates a new encoded quad
558    pub fn new(
559        subject: EncodedTerm,
560        predicate: EncodedTerm,
561        object: EncodedTerm,
562        graph_name: EncodedTerm,
563    ) -> Self {
564        Self {
565            subject,
566            predicate,
567            object,
568            graph_name,
569        }
570    }
571
572    /// Returns the size hint for this quad
573    pub fn size_hint(&self) -> usize {
574        self.subject.size_hint()
575            + self.predicate.size_hint()
576            + self.object.size_hint()
577            + self.graph_name.size_hint()
578    }
579}
580
581#[cfg(test)]
582mod tests {
583    use super::*;
584    use crate::model::*;
585    use crate::vocab::xsd;
586
587    #[test]
588    fn test_str_hash() {
589        let hash1 = StrHash::new("http://example.org/test");
590        let hash2 = StrHash::new("http://example.org/test");
591        let hash3 = StrHash::new("http://example.org/different");
592
593        assert_eq!(hash1, hash2);
594        assert_ne!(hash1, hash3);
595    }
596
597    #[test]
598    fn test_small_string() {
599        let small = SmallString::new("test").expect("construction should succeed");
600        assert_eq!(small.as_str(), "test");
601        assert_eq!(small.len(), 4);
602        assert!(!small.is_empty());
603
604        let empty = SmallString::new("").expect("construction should succeed");
605        assert!(empty.is_empty());
606
607        // Test too long string
608        let long_str = "this is a very long string that exceeds the maximum inline length";
609        assert!(SmallString::new(long_str).is_none());
610    }
611
612    #[test]
613    fn test_encode_named_node() {
614        let node = NamedNode::new("http://example.org/test").expect("valid IRI");
615        let encoded = EncodedTerm::encode_named_node(&node);
616
617        assert!(encoded.is_named_node());
618        assert!(!encoded.is_blank_node());
619        assert!(!encoded.is_literal());
620    }
621
622    #[test]
623    fn test_encode_blank_node() {
624        let node = BlankNode::new("test").expect("valid blank node id");
625        let encoded = EncodedTerm::encode_blank_node(&node);
626
627        assert!(!encoded.is_named_node());
628        assert!(encoded.is_blank_node());
629        assert!(!encoded.is_literal());
630    }
631
632    #[test]
633    fn test_encode_literal() {
634        // Simple string literal
635        let literal = Literal::new("test");
636        let encoded = EncodedTerm::encode_literal(&literal);
637        assert!(encoded.is_literal());
638        assert!(matches!(encoded, EncodedTerm::SmallStringLiteral(_)));
639
640        // Language-tagged literal
641        let literal = Literal::new_lang("test", "en").expect("construction should succeed");
642        let encoded = EncodedTerm::encode_literal(&literal);
643        assert!(encoded.is_literal());
644        assert!(matches!(
645            encoded,
646            EncodedTerm::SmallSmallLangStringLiteral { .. }
647        ));
648
649        // Typed literal
650        let literal = Literal::new_typed("42", xsd::INTEGER.clone());
651        let encoded = EncodedTerm::encode_literal(&literal);
652        assert!(encoded.is_literal());
653        assert!(matches!(encoded, EncodedTerm::SmallBigTypedLiteral { .. }));
654    }
655
656    #[test]
657    fn test_encoded_triple() {
658        let subject = EncodedTerm::encode_named_node(
659            &NamedNode::new("http://example.org/s").expect("valid IRI"),
660        );
661        let predicate = EncodedTerm::encode_named_node(
662            &NamedNode::new("http://example.org/p").expect("valid IRI"),
663        );
664        let object = EncodedTerm::encode_literal(&Literal::new("test"));
665
666        let triple = EncodedTriple::new(subject, predicate, object);
667        assert!(triple.size_hint() > 0);
668    }
669
670    #[test]
671    fn test_type_discriminant() {
672        let named_node = EncodedTerm::NamedNode {
673            iri_id: StrHash::new("http://example.org/test"),
674        };
675        let blank_node = EncodedTerm::SmallBlankNode(
676            SmallString::new("test").expect("construction should succeed"),
677        );
678        let literal = EncodedTerm::SmallStringLiteral(
679            SmallString::new("test").expect("construction should succeed"),
680        );
681
682        assert_eq!(named_node.type_discriminant(), 1);
683        assert_eq!(blank_node.type_discriminant(), 3);
684        assert_eq!(literal.type_discriminant(), 5);
685    }
686}