rdf_fusion_encoding/encoding.rs
1use crate::EncodingName;
2use datafusion::arrow::array::{Array, ArrayRef};
3use datafusion::arrow::datatypes::DataType;
4use datafusion::common::{ScalarValue, exec_err};
5use datafusion::logical_expr::ColumnarValue;
6use rdf_fusion_model::DFResult;
7use rdf_fusion_model::ThinResult;
8use std::fmt::Debug;
9
10/// Represents an Arrow [Array] with a specific [TermEncoding].
11///
12/// The constructors of types that implement [EncodingArray] are meant to ensure that the
13/// [ArrayRef] upholds all invariants of the encoding.
14pub trait EncodingArray: Clone {
15 /// The encoding used by this array.
16 type Encoding: TermEncoding;
17
18 /// Obtains the encoding instance for this array.
19 fn encoding(&self) -> &Self::Encoding;
20
21 /// Returns a reference to the inner array.
22 fn array(&self) -> &ArrayRef;
23
24 /// Consumes `self` and returns the inner array.
25 fn into_array(self) -> ArrayRef;
26
27 /// Extracts a scalar from this array at `index`.
28 ///
29 /// Returns an error if the `index` is out of bounds.
30 fn try_as_scalar(
31 &self,
32 index: usize,
33 ) -> DFResult<<Self::Encoding as TermEncoding>::Scalar> {
34 let scalar = ScalarValue::try_from_array(self.array(), index)?;
35 self.encoding().try_new_scalar(scalar)
36 }
37}
38
39/// Represents an Arrow [ScalarValue] with a specific [TermEncoding].
40///
41/// The constructors of types that implement [EncodingScalar] are meant to ensure that the
42/// [ScalarValue] upholds all invariants of the encoding.
43pub trait EncodingScalar {
44 /// The encoding used by this scalar.
45 type Encoding: TermEncoding;
46
47 /// Obtains the encoding instance for this scalar.
48 fn encoding(&self) -> &Self::Encoding;
49
50 /// Returns a reference to the inner scalar value.
51 fn scalar_value(&self) -> &ScalarValue;
52
53 /// Consumes `self` and returns the inner scalar value.
54 fn into_scalar_value(self) -> ScalarValue;
55
56 /// Produces a new array with `number_of_rows`.
57 fn to_array(
58 &self,
59 number_of_rows: usize,
60 ) -> DFResult<<Self::Encoding as TermEncoding>::Array> {
61 let array = self.scalar_value().to_array_of_size(number_of_rows)?;
62 self.encoding().try_new_array(array)
63 }
64}
65
66/// A term encoding defines how RDF terms are represented in Arrow arrays.
67///
68/// Each encoding defines a [DataType] that is uses for encoding RDF terms, while also having a
69/// wrapper [Self::Array] and [Self::Scalar] for Arrow arrays and scalars.
70///
71/// Different term encodings usually have different purposes and may only be valid for certain
72/// operations. For example, the [TypedValueEncoding](crate::typed_value::TypedValueEncoding) cannot
73/// be used to perform arbitrary join operations as it does not retain the lexical value of the RDF
74/// literals. On the other hand, the [TypedValueEncoding](crate::typed_value::TypedValueEncoding)
75/// will outperform the [PlainTermEncoding](crate::plain_term::PlainTermEncoding) for nested
76/// numerical operations as the parsing and validation of numeric literals is only done once.
77/// It is up to the user to ensure the correct use.
78pub trait TermEncoding: Debug + Send + Sync {
79 /// Represents a wrapper for Arrow arrays of this encoding. This can be used in
80 /// conjunction with [TermDecoder] to obtain the values from an Arrow array.
81 type Array: EncodingArray<Encoding = Self>;
82 /// Represents a wrapper for Arrow scalars of this encoding. This can be used in
83 /// conjunction with [TermDecoder] to obtain the values from an Arrow scalar.
84 type Scalar: EncodingScalar<Encoding = Self>;
85
86 /// Returns the name of the encoding.
87 fn name(&self) -> EncodingName;
88
89 /// Returns the [DataType] that is used for this encoding.
90 ///
91 /// This function depends on the instance of an encoding, as some encodings can be configured
92 /// such that the data type changes (at least in the future). Some encodings also expose a
93 /// statically known data type (e.g., [PlainTermEncoding::data_type](crate::plain_term::PlainTermEncoding::data_type)).
94 fn data_type(&self) -> DataType;
95
96 /// Checks whether `array` contains a value with the correct encoding (i.e., type and possibly
97 /// metadata checks). If yes, returns an instance of [Self::Array]. Otherwise, an error is
98 /// returned.
99 fn try_new_array(&self, array: ArrayRef) -> DFResult<Self::Array>;
100
101 /// Checks whether `scalar` contains a value with the correct encoding (i.e., type and possibly
102 /// metadata checks). If yes, returns an instance of [Self::Scalar]. Otherwise, an error is
103 /// returned.
104 fn try_new_scalar(&self, scalar: ScalarValue) -> DFResult<Self::Scalar>;
105
106 /// Checks whether `value` contains a value with the correct encoding (i.e., type and possibly
107 /// metadata checks). If yes, returns a datum that either wraps an array or a scalar. Otherwise,
108 /// an error is returned.
109 fn try_new_datum(
110 &self,
111 value: ColumnarValue,
112 number_rows: usize,
113 ) -> DFResult<EncodingDatum<Self>> {
114 let datum = match value {
115 ColumnarValue::Array(array) => {
116 if array.len() != number_rows {
117 return exec_err!(
118 "Unexpected array size. Expected {number_rows}, Actual: {}",
119 array.len()
120 );
121 }
122 EncodingDatum::Array(self.try_new_array(array)?)
123 }
124 ColumnarValue::Scalar(scalar) => {
125 EncodingDatum::Scalar(self.try_new_scalar(scalar)?, number_rows)
126 }
127 };
128 Ok(datum)
129 }
130}
131
132/// Allows extracting an iterator of a type from an [EncodingArray].
133///
134/// This allows uesrs to access the inner values of an RDF term array. It allows one to
135/// obtain a typed iterator over the RDF terms in the array. A decoder is specialized for one
136/// encoding and one value type ([Self::Term]).
137///
138/// ### Compatibility
139///
140/// Decoders are allowed to only support a subset of the encoded RDF terms. For example, a decoder
141/// for boolean values may produce an error if it encounters a literal with a different type.
142/// However, it is recommended that there is one decoder per [TermEncoding] that allows users to
143/// extract all RDF terms.
144///
145/// ### Performance
146///
147/// Using a [TermDecoder] for accessing the array, performing an operation on [Self::Term], and then
148/// re-encoding the resulting value using a [TermEncoder] may incur a performance penalty. However,
149/// we hope that this impact can be mitigated by compiler optimizations. We have yet to benchmark
150/// this impact to make a founded recommendation of when to use decoders and encoders. Users are
151/// free to directly work on the Arrow arrays to side-step the typed Encoding/Decoding machinery.
152pub trait TermDecoder<TEncoding: TermEncoding + ?Sized>: Debug + Sync + Send {
153 /// The resulting value type of decoding an RDF term.
154 type Term<'data>;
155
156 /// Allows extracting an iterator over all RDF terms in `array` that are _compatible_ with this
157 /// decoder (see [TermDecoder] for more information).
158 ///
159 /// The creation of the iterator cannot fail by itself, as the invariants of the encodings
160 /// should have been checked while creating `array`. However, the iterator may return an error
161 /// on every new value. This could be due to the value being incompatible with the decoder.
162 fn decode_terms(
163 array: &TEncoding::Array,
164 ) -> impl Iterator<Item = ThinResult<Self::Term<'_>>>;
165
166 /// Allows extracting an iterator over all RDF terms in `array` that are _compatible_ with this
167 /// decoder (see [TermDecoder] for more information).
168 ///
169 /// The creation of the value can fail if the value stored in the `scalar` is incompatible with
170 /// this decoder.
171 fn decode_term(scalar: &TEncoding::Scalar) -> ThinResult<Self::Term<'_>>;
172}
173
174/// Allows encoding an iterator of a type into an [EncodingArray].
175///
176/// This allows users to encode values in an RDF term array. An encoder is specialized for
177/// one encoding and one value type ([Self::Term]). The value type may only represent a subset of
178/// all valid RDF terms (e.g., only Boolean values). However, it is recommended that there is
179/// one decoder per [TermEncoding] that allows users to encode all RDF terms.
180///
181/// ### Performance
182///
183/// Using a [TermDecoder] for accessing the array, performing an operation on [Self::Term], and then
184/// re-encoding the resulting value using a [TermEncoder] may incur a performance penalty. However,
185/// we hope that this impact can be mitigated by compiler optimizations. We have yet to benchmark
186/// this impact to make a founded recommendation of when to use decoders and encoders. Users are
187/// free to directly work on the Arrow arrays to side-step the typed Encoding/Decoding machinery.
188pub trait TermEncoder<TEncoding: TermEncoding + ?Sized>: Debug + Sync + Send {
189 /// The value type that is being encoded.
190 type Term<'data>;
191
192 /// Allows encoding an iterator over RDF terms in an Arrow array.
193 fn encode_terms<'data>(
194 terms: impl IntoIterator<Item = ThinResult<Self::Term<'data>>>,
195 ) -> DFResult<TEncoding::Array>;
196
197 /// Allows encoding a scalar RDF term in an Arrow scalar.
198 fn encode_term(term: ThinResult<Self::Term<'_>>) -> DFResult<TEncoding::Scalar>;
199}
200
201/// Represents either an array or a scalar for a given encoding.
202///
203/// As the scalar variant also stores length information, one can obtain an iterator
204/// ([Self::term_iter]) independently on whether the underlying data is an array or a scalar. This
205/// is useful for scenarios in which distinguishing between array/scalar is not necessary or too
206/// complex.
207pub enum EncodingDatum<TEncoding: TermEncoding + ?Sized> {
208 /// An array underlies this datum.
209 Array(TEncoding::Array),
210 /// A scalar underlies this datum. The additional length value is crucial for creating an
211 /// iterator of a given length.
212 Scalar(TEncoding::Scalar, usize),
213}
214
215impl<TEncoding: TermEncoding + ?Sized> EncodingDatum<TEncoding> {
216 /// Creates an iterator over the contents of this datum.
217 ///
218 /// For an array, the iterator will simply return the result from the decoder.
219 ///
220 /// For a scalar, the value of the scalar will be cloned for each iteration, as dictated by the
221 /// additional length.
222 pub fn term_iter<'data, TDecoder>(
223 &'data self,
224 ) -> Box<dyn Iterator<Item = ThinResult<TDecoder::Term<'data>>> + 'data>
225 where
226 TDecoder: TermDecoder<TEncoding> + 'data,
227 {
228 match self {
229 EncodingDatum::Array(array) => Box::new(
230 TDecoder::decode_terms(array)
231 .collect::<Vec<_>>()
232 .into_iter(),
233 ),
234 EncodingDatum::Scalar(scalar, n) => {
235 Box::new((0..*n).map(|_| TDecoder::decode_term(scalar)))
236 }
237 }
238 }
239
240 /// Creates an array for this datum.
241 pub fn to_array(&self) -> TEncoding::Array {
242 match self {
243 EncodingDatum::Array(array) => array.clone(),
244 EncodingDatum::Scalar(scalar, number_rows) => {
245 scalar.to_array(*number_rows).unwrap()
246 }
247 }
248 }
249}