Skip to main content

oxirs_core/store/
binary.rs

1//! Binary encoding and decoding for RDF terms
2//!
3//! This implementation is extracted and adapted from Oxigraph's binary_encoder.rs
4//! to provide zero-dependency binary serialization with optimal storage efficiency.
5
6use crate::store::encoding::{EncodedQuad, EncodedTerm, SmallString, StrHash};
7use crate::OxirsError;
8use std::io::{Cursor, Read};
9use std::mem::size_of;
10
/// Maximum size of an encoded term in bytes
///
/// Computed as one type-tag byte plus two `StrHash`-sized payloads, which is
/// the largest layout `encode_term` writes for a non-nested term.
///
/// NOTE(review): quoted triples are encoded recursively (tag byte followed by
/// three nested terms), so such terms can exceed this value — confirm that
/// callers relying on this bound never size buffers for quoted triples with it.
pub const WRITTEN_TERM_MAX_SIZE: usize = size_of::<u8>() + 2 * size_of::<StrHash>();

// Encoded term type constants
//
// Each constant is the leading tag byte of one `EncodedTerm` variant. In the
// literal tags, SMALL marks a component stored inline as a length-prefixed
// small string and BIG marks a component stored as a `StrHash`.

// Graph and IRI terms.
const TYPE_DEFAULT_GRAPH: u8 = 0;
const TYPE_NAMED_NODE_ID: u8 = 1;
// Blank nodes.
const TYPE_NUMERICAL_BLANK_NODE_ID: u8 = 8;
const TYPE_SMALL_BLANK_NODE_ID: u8 = 9;
const TYPE_BIG_BLANK_NODE_ID: u8 = 10;
// Plain string literals.
const TYPE_SMALL_STRING_LITERAL: u8 = 16;
const TYPE_BIG_STRING_LITERAL: u8 = 17;
// Language-tagged literals: first size word is the value, second the language.
const TYPE_SMALL_SMALL_LANG_STRING_LITERAL: u8 = 20;
const TYPE_SMALL_BIG_LANG_STRING_LITERAL: u8 = 21;
const TYPE_BIG_SMALL_LANG_STRING_LITERAL: u8 = 22;
const TYPE_BIG_BIG_LANG_STRING_LITERAL: u8 = 23;
// Typed literals: first size word is the value, second the datatype.
const TYPE_SMALL_SMALL_TYPED_LITERAL: u8 = 24;
const TYPE_SMALL_BIG_TYPED_LITERAL: u8 = 25;
const TYPE_BIG_SMALL_TYPED_LITERAL: u8 = 26;
const TYPE_BIG_BIG_TYPED_LITERAL: u8 = 27;
// Quoted triple: the tag is followed by three nested encoded terms.
const TYPE_QUOTED_TRIPLE: u8 = 30;
31
32/// Quad encoding variations for different sort orders
33#[derive(Clone, Copy, Debug)]
34pub enum QuadEncoding {
35    /// Subject, Predicate, Object, Graph
36    Spog,
37    /// Predicate, Object, Subject, Graph
38    Posg,
39    /// Object, Subject, Predicate, Graph
40    Ospg,
41    /// Graph, Subject, Predicate, Object
42    Gspo,
43    /// Graph, Predicate, Object, Subject
44    Gpos,
45    /// Graph, Object, Subject, Predicate
46    Gosp,
47}
48
49impl QuadEncoding {
50    /// Decodes a quad from a buffer according to this encoding
51    pub fn decode(self, buffer: &[u8]) -> Result<EncodedQuad, OxirsError> {
52        let mut cursor = Cursor::new(buffer);
53        match self {
54            Self::Spog => decode_spog_quad(&mut cursor),
55            Self::Posg => decode_posg_quad(&mut cursor),
56            Self::Ospg => decode_ospg_quad(&mut cursor),
57            Self::Gspo => decode_gspo_quad(&mut cursor),
58            Self::Gpos => decode_gpos_quad(&mut cursor),
59            Self::Gosp => decode_gosp_quad(&mut cursor),
60        }
61    }
62
63    /// Encodes a quad to a buffer according to this encoding
64    pub fn encode(self, quad: &EncodedQuad, buffer: &mut Vec<u8>) -> Result<(), OxirsError> {
65        match self {
66            Self::Spog => encode_spog_quad(quad, buffer),
67            Self::Posg => encode_posg_quad(quad, buffer),
68            Self::Ospg => encode_ospg_quad(quad, buffer),
69            Self::Gspo => encode_gspo_quad(quad, buffer),
70            Self::Gpos => encode_gpos_quad(quad, buffer),
71            Self::Gosp => encode_gosp_quad(quad, buffer),
72        }
73    }
74}
75
76/// Encodes a term to a binary representation
77pub fn encode_term(term: &EncodedTerm, buffer: &mut Vec<u8>) -> Result<(), OxirsError> {
78    match term {
79        EncodedTerm::DefaultGraph => {
80            buffer.push(TYPE_DEFAULT_GRAPH);
81        }
82        EncodedTerm::NamedNode { iri_id } => {
83            buffer.push(TYPE_NAMED_NODE_ID);
84            buffer.extend_from_slice(&iri_id.to_be_bytes());
85        }
86        EncodedTerm::NumericalBlankNode { id } => {
87            buffer.push(TYPE_NUMERICAL_BLANK_NODE_ID);
88            buffer.extend_from_slice(id);
89        }
90        EncodedTerm::SmallBlankNode(id) => {
91            buffer.push(TYPE_SMALL_BLANK_NODE_ID);
92            encode_small_string(id, buffer);
93        }
94        EncodedTerm::BigBlankNode { id_id } => {
95            buffer.push(TYPE_BIG_BLANK_NODE_ID);
96            buffer.extend_from_slice(&id_id.to_be_bytes());
97        }
98        EncodedTerm::SmallStringLiteral(value) => {
99            buffer.push(TYPE_SMALL_STRING_LITERAL);
100            encode_small_string(value, buffer);
101        }
102        EncodedTerm::BigStringLiteral { value_id } => {
103            buffer.push(TYPE_BIG_STRING_LITERAL);
104            buffer.extend_from_slice(&value_id.to_be_bytes());
105        }
106        EncodedTerm::SmallSmallLangStringLiteral { value, language } => {
107            buffer.push(TYPE_SMALL_SMALL_LANG_STRING_LITERAL);
108            encode_small_string(value, buffer);
109            encode_small_string(language, buffer);
110        }
111        EncodedTerm::SmallBigLangStringLiteral { value, language_id } => {
112            buffer.push(TYPE_SMALL_BIG_LANG_STRING_LITERAL);
113            encode_small_string(value, buffer);
114            buffer.extend_from_slice(&language_id.to_be_bytes());
115        }
116        EncodedTerm::BigSmallLangStringLiteral { value_id, language } => {
117            buffer.push(TYPE_BIG_SMALL_LANG_STRING_LITERAL);
118            buffer.extend_from_slice(&value_id.to_be_bytes());
119            encode_small_string(language, buffer);
120        }
121        EncodedTerm::BigBigLangStringLiteral {
122            value_id,
123            language_id,
124        } => {
125            buffer.push(TYPE_BIG_BIG_LANG_STRING_LITERAL);
126            buffer.extend_from_slice(&value_id.to_be_bytes());
127            buffer.extend_from_slice(&language_id.to_be_bytes());
128        }
129        EncodedTerm::SmallSmallTypedLiteral { value, datatype } => {
130            buffer.push(TYPE_SMALL_SMALL_TYPED_LITERAL);
131            encode_small_string(value, buffer);
132            encode_small_string(datatype, buffer);
133        }
134        EncodedTerm::SmallBigTypedLiteral { value, datatype_id } => {
135            buffer.push(TYPE_SMALL_BIG_TYPED_LITERAL);
136            encode_small_string(value, buffer);
137            buffer.extend_from_slice(&datatype_id.to_be_bytes());
138        }
139        EncodedTerm::BigSmallTypedLiteral { value_id, datatype } => {
140            buffer.push(TYPE_BIG_SMALL_TYPED_LITERAL);
141            buffer.extend_from_slice(&value_id.to_be_bytes());
142            encode_small_string(datatype, buffer);
143        }
144        EncodedTerm::BigBigTypedLiteral {
145            value_id,
146            datatype_id,
147        } => {
148            buffer.push(TYPE_BIG_BIG_TYPED_LITERAL);
149            buffer.extend_from_slice(&value_id.to_be_bytes());
150            buffer.extend_from_slice(&datatype_id.to_be_bytes());
151        }
152        EncodedTerm::QuotedTriple {
153            subject,
154            predicate,
155            object,
156        } => {
157            buffer.push(TYPE_QUOTED_TRIPLE);
158            encode_term(subject, buffer)?;
159            encode_term(predicate, buffer)?;
160            encode_term(object, buffer)?;
161        }
162    }
163    Ok(())
164}
165
166/// Decodes a term from a binary representation
167pub fn decode_term(buffer: &mut Cursor<&[u8]>) -> Result<EncodedTerm, OxirsError> {
168    let mut type_byte = [0u8; 1];
169    buffer
170        .read_exact(&mut type_byte)
171        .map_err(|e| OxirsError::Store(format!("Failed to read type byte: {e}")))?;
172
173    match type_byte[0] {
174        TYPE_DEFAULT_GRAPH => Ok(EncodedTerm::DefaultGraph),
175        TYPE_NAMED_NODE_ID => {
176            let iri_id = read_str_hash(buffer)?;
177            Ok(EncodedTerm::NamedNode { iri_id })
178        }
179        TYPE_NUMERICAL_BLANK_NODE_ID => {
180            let mut id = [0u8; 16];
181            buffer
182                .read_exact(&mut id)
183                .map_err(|e| OxirsError::Store(format!("Failed to read blank node ID: {e}")))?;
184            Ok(EncodedTerm::NumericalBlankNode { id })
185        }
186        TYPE_SMALL_BLANK_NODE_ID => {
187            let id = decode_small_string(buffer)?;
188            Ok(EncodedTerm::SmallBlankNode(id))
189        }
190        TYPE_BIG_BLANK_NODE_ID => {
191            let id_id = read_str_hash(buffer)?;
192            Ok(EncodedTerm::BigBlankNode { id_id })
193        }
194        TYPE_SMALL_STRING_LITERAL => {
195            let value = decode_small_string(buffer)?;
196            Ok(EncodedTerm::SmallStringLiteral(value))
197        }
198        TYPE_BIG_STRING_LITERAL => {
199            let value_id = read_str_hash(buffer)?;
200            Ok(EncodedTerm::BigStringLiteral { value_id })
201        }
202        TYPE_SMALL_SMALL_LANG_STRING_LITERAL => {
203            let value = decode_small_string(buffer)?;
204            let language = decode_small_string(buffer)?;
205            Ok(EncodedTerm::SmallSmallLangStringLiteral { value, language })
206        }
207        TYPE_SMALL_BIG_LANG_STRING_LITERAL => {
208            let value = decode_small_string(buffer)?;
209            let language_id = read_str_hash(buffer)?;
210            Ok(EncodedTerm::SmallBigLangStringLiteral { value, language_id })
211        }
212        TYPE_BIG_SMALL_LANG_STRING_LITERAL => {
213            let value_id = read_str_hash(buffer)?;
214            let language = decode_small_string(buffer)?;
215            Ok(EncodedTerm::BigSmallLangStringLiteral { value_id, language })
216        }
217        TYPE_BIG_BIG_LANG_STRING_LITERAL => {
218            let value_id = read_str_hash(buffer)?;
219            let language_id = read_str_hash(buffer)?;
220            Ok(EncodedTerm::BigBigLangStringLiteral {
221                value_id,
222                language_id,
223            })
224        }
225        TYPE_SMALL_SMALL_TYPED_LITERAL => {
226            let value = decode_small_string(buffer)?;
227            let datatype = decode_small_string(buffer)?;
228            Ok(EncodedTerm::SmallSmallTypedLiteral { value, datatype })
229        }
230        TYPE_SMALL_BIG_TYPED_LITERAL => {
231            let value = decode_small_string(buffer)?;
232            let datatype_id = read_str_hash(buffer)?;
233            Ok(EncodedTerm::SmallBigTypedLiteral { value, datatype_id })
234        }
235        TYPE_BIG_SMALL_TYPED_LITERAL => {
236            let value_id = read_str_hash(buffer)?;
237            let datatype = decode_small_string(buffer)?;
238            Ok(EncodedTerm::BigSmallTypedLiteral { value_id, datatype })
239        }
240        TYPE_BIG_BIG_TYPED_LITERAL => {
241            let value_id = read_str_hash(buffer)?;
242            let datatype_id = read_str_hash(buffer)?;
243            Ok(EncodedTerm::BigBigTypedLiteral {
244                value_id,
245                datatype_id,
246            })
247        }
248        TYPE_QUOTED_TRIPLE => {
249            let subject = Box::new(decode_term(buffer)?);
250            let predicate = Box::new(decode_term(buffer)?);
251            let object = Box::new(decode_term(buffer)?);
252            Ok(EncodedTerm::QuotedTriple {
253                subject,
254                predicate,
255                object,
256            })
257        }
258        type_byte => Err(OxirsError::Store(format!(
259            "Unknown encoded term type: {type_byte}"
260        ))),
261    }
262}
263
264/// Encodes a small string
265fn encode_small_string(small_string: &SmallString, buffer: &mut Vec<u8>) {
266    buffer.push(small_string.len() as u8);
267    buffer.extend_from_slice(small_string.as_str().as_bytes());
268}
269
270/// Decodes a small string
271fn decode_small_string(buffer: &mut Cursor<&[u8]>) -> Result<SmallString, OxirsError> {
272    let mut len_byte = [0u8; 1];
273    buffer
274        .read_exact(&mut len_byte)
275        .map_err(|e| OxirsError::Store(format!("Failed to read string length: {e}")))?;
276
277    let len = len_byte[0] as usize;
278    if len > 15 {
279        return Err(OxirsError::Store(format!(
280            "SmallString length {len} exceeds maximum of 15"
281        )));
282    }
283
284    let mut data = [0u8; 16];
285    if len > 0 {
286        buffer
287            .read_exact(&mut data[..len])
288            .map_err(|e| OxirsError::Store(format!("Failed to read string data: {e}")))?;
289    }
290
291    let s = std::str::from_utf8(&data[..len])
292        .map_err(|e| OxirsError::Store(format!("Invalid UTF-8 in small string: {e}")))?;
293
294    SmallString::new(s)
295        .ok_or_else(|| OxirsError::Store("String too long for SmallString".to_string()))
296}
297
298/// Reads a StrHash from the buffer
299fn read_str_hash(buffer: &mut Cursor<&[u8]>) -> Result<StrHash, OxirsError> {
300    let mut hash_bytes = [0u8; 16];
301    buffer
302        .read_exact(&mut hash_bytes)
303        .map_err(|e| OxirsError::Store(format!("Failed to read StrHash: {e}")))?;
304    Ok(StrHash::from_be_bytes(hash_bytes))
305}
306
307// Quad encoding functions
308
309fn encode_spog_quad(quad: &EncodedQuad, buffer: &mut Vec<u8>) -> Result<(), OxirsError> {
310    encode_term(&quad.subject, buffer)?;
311    encode_term(&quad.predicate, buffer)?;
312    encode_term(&quad.object, buffer)?;
313    encode_term(&quad.graph_name, buffer)
314}
315
316fn decode_spog_quad(cursor: &mut Cursor<&[u8]>) -> Result<EncodedQuad, OxirsError> {
317    let subject = decode_term(cursor)?;
318    let predicate = decode_term(cursor)?;
319    let object = decode_term(cursor)?;
320    let graph_name = decode_term(cursor)?;
321    Ok(EncodedQuad::new(subject, predicate, object, graph_name))
322}
323
324fn encode_posg_quad(quad: &EncodedQuad, buffer: &mut Vec<u8>) -> Result<(), OxirsError> {
325    encode_term(&quad.predicate, buffer)?;
326    encode_term(&quad.object, buffer)?;
327    encode_term(&quad.subject, buffer)?;
328    encode_term(&quad.graph_name, buffer)
329}
330
331fn decode_posg_quad(cursor: &mut Cursor<&[u8]>) -> Result<EncodedQuad, OxirsError> {
332    let predicate = decode_term(cursor)?;
333    let object = decode_term(cursor)?;
334    let subject = decode_term(cursor)?;
335    let graph_name = decode_term(cursor)?;
336    Ok(EncodedQuad::new(subject, predicate, object, graph_name))
337}
338
339fn encode_ospg_quad(quad: &EncodedQuad, buffer: &mut Vec<u8>) -> Result<(), OxirsError> {
340    encode_term(&quad.object, buffer)?;
341    encode_term(&quad.subject, buffer)?;
342    encode_term(&quad.predicate, buffer)?;
343    encode_term(&quad.graph_name, buffer)
344}
345
346fn decode_ospg_quad(cursor: &mut Cursor<&[u8]>) -> Result<EncodedQuad, OxirsError> {
347    let object = decode_term(cursor)?;
348    let subject = decode_term(cursor)?;
349    let predicate = decode_term(cursor)?;
350    let graph_name = decode_term(cursor)?;
351    Ok(EncodedQuad::new(subject, predicate, object, graph_name))
352}
353
354fn encode_gspo_quad(quad: &EncodedQuad, buffer: &mut Vec<u8>) -> Result<(), OxirsError> {
355    encode_term(&quad.graph_name, buffer)?;
356    encode_term(&quad.subject, buffer)?;
357    encode_term(&quad.predicate, buffer)?;
358    encode_term(&quad.object, buffer)
359}
360
361fn decode_gspo_quad(cursor: &mut Cursor<&[u8]>) -> Result<EncodedQuad, OxirsError> {
362    let graph_name = decode_term(cursor)?;
363    let subject = decode_term(cursor)?;
364    let predicate = decode_term(cursor)?;
365    let object = decode_term(cursor)?;
366    Ok(EncodedQuad::new(subject, predicate, object, graph_name))
367}
368
369fn encode_gpos_quad(quad: &EncodedQuad, buffer: &mut Vec<u8>) -> Result<(), OxirsError> {
370    encode_term(&quad.graph_name, buffer)?;
371    encode_term(&quad.predicate, buffer)?;
372    encode_term(&quad.object, buffer)?;
373    encode_term(&quad.subject, buffer)
374}
375
376fn decode_gpos_quad(cursor: &mut Cursor<&[u8]>) -> Result<EncodedQuad, OxirsError> {
377    let graph_name = decode_term(cursor)?;
378    let predicate = decode_term(cursor)?;
379    let object = decode_term(cursor)?;
380    let subject = decode_term(cursor)?;
381    Ok(EncodedQuad::new(subject, predicate, object, graph_name))
382}
383
384fn encode_gosp_quad(quad: &EncodedQuad, buffer: &mut Vec<u8>) -> Result<(), OxirsError> {
385    encode_term(&quad.graph_name, buffer)?;
386    encode_term(&quad.object, buffer)?;
387    encode_term(&quad.subject, buffer)?;
388    encode_term(&quad.predicate, buffer)
389}
390
391fn decode_gosp_quad(cursor: &mut Cursor<&[u8]>) -> Result<EncodedQuad, OxirsError> {
392    let graph_name = decode_term(cursor)?;
393    let object = decode_term(cursor)?;
394    let subject = decode_term(cursor)?;
395    let predicate = decode_term(cursor)?;
396    Ok(EncodedQuad::new(subject, predicate, object, graph_name))
397}
398
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `SmallString` for a fixture, panicking if the text is rejected.
    fn small(text: &str) -> SmallString {
        SmallString::new(text).expect("construction should succeed")
    }

    /// Every sample term must decode back to the value that was encoded.
    #[test]
    fn test_term_encoding_roundtrip() {
        let samples = [
            EncodedTerm::DefaultGraph,
            EncodedTerm::NamedNode {
                iri_id: StrHash::new("http://example.org/test"),
            },
            EncodedTerm::SmallBlankNode(small("test")),
            EncodedTerm::SmallStringLiteral(small("hello")),
            EncodedTerm::SmallSmallLangStringLiteral {
                value: small("hello"),
                language: small("en"),
            },
        ];

        for original in &samples {
            let mut bytes = Vec::new();
            encode_term(original, &mut bytes).expect("term encoding should succeed");

            let mut reader = Cursor::new(bytes.as_slice());
            let restored = decode_term(&mut reader).expect("term decoding should succeed");

            assert_eq!(original, &restored);
        }
    }

    /// A quad must survive an encode/decode cycle under every layout.
    #[test]
    fn test_quad_encoding_roundtrip() {
        let subject = EncodedTerm::NamedNode {
            iri_id: StrHash::new("http://example.org/s"),
        };
        let predicate = EncodedTerm::NamedNode {
            iri_id: StrHash::new("http://example.org/p"),
        };
        let object = EncodedTerm::SmallStringLiteral(small("object"));
        let quad = EncodedQuad::new(subject, predicate, object, EncodedTerm::DefaultGraph);

        let layouts = [
            QuadEncoding::Spog,
            QuadEncoding::Posg,
            QuadEncoding::Ospg,
            QuadEncoding::Gspo,
            QuadEncoding::Gpos,
            QuadEncoding::Gosp,
        ];

        for layout in layouts {
            let mut bytes = Vec::new();
            layout
                .encode(&quad, &mut bytes)
                .expect("encoding should succeed");

            let restored = layout.decode(&bytes).expect("decoding should succeed");
            assert_eq!(quad, restored);
        }
    }

    /// Small strings, including empty and multi-byte ones, round-trip unchanged.
    #[test]
    fn test_small_string_encoding() {
        let candidates = ["", "test", "hello world", "emoji🚀"];

        // Candidates that `SmallString::new` rejects are skipped, not failed.
        for original in candidates.iter().filter_map(|s| SmallString::new(s)) {
            let mut bytes = Vec::new();
            encode_small_string(&original, &mut bytes);

            let mut reader = Cursor::new(bytes.as_slice());
            let restored =
                decode_small_string(&mut reader).expect("string decoding should succeed");

            assert_eq!(original.as_str(), restored.as_str());
        }
    }

    /// A hash converted to big-endian bytes and back compares equal.
    #[test]
    fn test_str_hash_encoding() {
        let original = StrHash::new("http://example.org/test");
        let restored = StrHash::from_be_bytes(original.to_be_bytes());
        assert_eq!(original, restored);
    }
}