rdf_fusion_encoding/object_id/encoding.rs
1use crate::EncodingName;
2use crate::encoding::TermEncoding;
3use crate::object_id::{ObjectIdArray, ObjectIdScalar};
4use datafusion::arrow::array::ArrayRef;
5use datafusion::arrow::datatypes::DataType;
6use datafusion::common::ScalarValue;
7use rdf_fusion_model::DFResult;
8use std::clone::Clone;
9use std::hash::Hash;
10
11/// The [ObjectIdEncoding] represents each distinct term in the database with a single unique id.
12/// We call such an id *object id*. Here is an example of the encoding:
13///
14/// ```text
15/// ?variable
16///
/// ┌─────┐
/// │  1  │ ────► <#MyEntity>
/// ├─────┤
/// │  2  │ ────► 120^^xsd:integer
/// ├─────┤
/// │ ... │
/// └─────┘
24/// ```
25///
26/// # Object ID Mapping
27///
28/// The mapping implementation depends on the storage layer that is being used. For example, an
/// in-memory RDF store will use a different implementation than an on-disk RDF store. The
30/// [ObjectIdMapping](crate::object_id::ObjectIdMapping) trait defines the contract.
31///
32/// # Strengths and Weaknesses
33///
34/// The object id encoding is very well suited for evaluating joins, as instead of joining
35/// variable-length RDF terms, we can directly join the object ids. While we do not have recent
36/// numbers for the performance gains, the [original pull request](https://github.com/tobixdev/rdf-fusion/pull/27)
37/// quadrupled the performance of some queries (with relatively small datasets!).
38///
39/// However, this also introduces the necessity of decoding the object ids back to RDF terms. For
40/// example, by converting it to the [PlainTermEncoding](crate::plain_term::PlainTermEncoding).
41/// For queries that spend little time on join operations, the cost of decoding the object ids can
42/// outweigh the benefits of using the object id encoding.
43///
44/// Furthermore, the encoding introduces the necessity of maintaining the
45/// [ObjectIdMapping](crate::object_id::ObjectIdMapping), which can be non-trivial.
46///
47/// # Current Limitation
48///
49/// Currently, this id is fixed to being a 32-bit integer. However, we have an
/// [issue](https://github.com/tobixdev/rdf-fusion/issues/50) that tracks the progress on lifting
/// this limitation.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct ObjectIdEncoding {
    /// Width of one object id, measured in bytes.
    object_id_size: u8,
}

impl ObjectIdEncoding {
    /// Builds an [ObjectIdEncoding] whose object ids are `object_id_size` bytes wide.
    ///
    /// The value is stored as given; no validation is performed here.
    pub fn new(object_id_size: u8) -> Self {
        ObjectIdEncoding { object_id_size }
    }

    /// Returns the width of one object id in bytes, exactly as passed to [Self::new].
    pub fn object_id_size(&self) -> u8 {
        let Self { object_id_size } = self;
        *object_id_size
    }
}
69
70impl TermEncoding for ObjectIdEncoding {
71 type Array = ObjectIdArray;
72 type Scalar = ObjectIdScalar;
73
74 fn name(&self) -> EncodingName {
75 EncodingName::PlainTerm
76 }
77
78 fn data_type(&self) -> DataType {
79 DataType::UInt32
80 }
81
82 fn try_new_array(&self, array: ArrayRef) -> DFResult<Self::Array> {
83 ObjectIdArray::try_new(self.clone(), array)
84 }
85
86 fn try_new_scalar(&self, scalar: ScalarValue) -> DFResult<Self::Scalar> {
87 ObjectIdScalar::try_new(self.clone(), scalar)
88 }
89}