rdf_fusion_encoding/object_id/
encoding.rs

1use crate::EncodingName;
2use crate::encoding::TermEncoding;
3use crate::object_id::{ObjectIdArray, ObjectIdScalar};
4use datafusion::arrow::array::ArrayRef;
5use datafusion::arrow::datatypes::DataType;
6use datafusion::common::ScalarValue;
7use rdf_fusion_model::DFResult;
8use std::clone::Clone;
9use std::hash::Hash;
10
/// The [ObjectIdEncoding] assigns a single unique id to every distinct term in the database.
/// Such an id is called an *object id*. The following example illustrates the encoding:
///
/// ```text
/// ?variable
///
///  ┌─────┐
///  │   1 │ ────►  <#MyEntity>
///  ├─────┤
///  │   2 │ ────►  120^^xsd:integer
///  ├─────┤
///  │ ... │
///  └─────┘
/// ```
///
/// # Object ID Mapping
///
/// How object ids are mapped to terms depends on the storage layer in use. For example, an
/// in-memory RDF store will use a different implementation than an on-disk RDF store. The
/// contract is defined by the [ObjectIdMapping](crate::object_id::ObjectIdMapping) trait.
///
/// # Strengths and Weaknesses
///
/// Joins are where the object id encoding shines: instead of joining variable-length RDF terms,
/// the object ids can be joined directly. While we do not have recent numbers for the
/// performance gains, the [original pull request](https://github.com/tobixdev/rdf-fusion/pull/27)
/// quadrupled the performance of some queries (with relatively small datasets!).
///
/// The downside is that object ids must eventually be decoded back to RDF terms, for example by
/// converting them to the [PlainTermEncoding](crate::plain_term::PlainTermEncoding). For queries
/// that spend little time on join operations, the cost of decoding the object ids can outweigh
/// the benefits of using the object id encoding.
///
/// In addition, the encoding requires maintaining the
/// [ObjectIdMapping](crate::object_id::ObjectIdMapping), which can be non-trivial.
///
/// # Current Limitation
///
/// Currently, the object id is fixed to being a 32-bit integer. However, we have an
/// [issue](https://github.com/tobixdev/rdf-fusion/issues/50) that tracks the progress on lifting
/// this limitation.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ObjectIdEncoding {
    /// Size of a single object id, in bytes.
    object_id_size: u8,
}
57
58impl ObjectIdEncoding {
59    /// Creates a new [ObjectIdEncoding].
60    pub fn new(object_id_size: u8) -> Self {
61        Self { object_id_size }
62    }
63
64    /// Returns the size of the object id.
65    pub fn object_id_size(&self) -> u8 {
66        self.object_id_size
67    }
68}
69
70impl TermEncoding for ObjectIdEncoding {
71    type Array = ObjectIdArray;
72    type Scalar = ObjectIdScalar;
73
74    fn name(&self) -> EncodingName {
75        EncodingName::PlainTerm
76    }
77
78    fn data_type(&self) -> DataType {
79        DataType::UInt32
80    }
81
82    fn try_new_array(&self, array: ArrayRef) -> DFResult<Self::Array> {
83        ObjectIdArray::try_new(self.clone(), array)
84    }
85
86    fn try_new_scalar(&self, scalar: ScalarValue) -> DFResult<Self::Scalar> {
87        ObjectIdScalar::try_new(self.clone(), scalar)
88    }
89}