rdf_fusion_encoding/
encoding.rs

1use crate::EncodingName;
2use datafusion::arrow::array::{Array, ArrayRef};
3use datafusion::arrow::datatypes::DataType;
4use datafusion::common::{ScalarValue, exec_err};
5use datafusion::logical_expr::ColumnarValue;
6use rdf_fusion_model::DFResult;
7use rdf_fusion_model::ThinResult;
8use std::fmt::Debug;
9
10/// Represents an Arrow [Array] with a specific [TermEncoding].
11///
12/// The constructors of types that implement [EncodingArray] are meant to ensure that the
13/// [ArrayRef] upholds all invariants of the encoding.
14pub trait EncodingArray: Clone {
15    /// The encoding used by this array.
16    type Encoding: TermEncoding;
17
18    /// Obtains the encoding instance for this array.
19    fn encoding(&self) -> &Self::Encoding;
20
21    /// Returns a reference to the inner array.
22    fn array(&self) -> &ArrayRef;
23
24    /// Consumes `self` and returns the inner array.
25    fn into_array(self) -> ArrayRef;
26
27    /// Extracts a scalar from this array at `index`.
28    ///
29    /// Returns an error if the `index` is out of bounds.
30    fn try_as_scalar(
31        &self,
32        index: usize,
33    ) -> DFResult<<Self::Encoding as TermEncoding>::Scalar> {
34        let scalar = ScalarValue::try_from_array(self.array(), index)?;
35        self.encoding().try_new_scalar(scalar)
36    }
37}
38
39/// Represents an Arrow [ScalarValue] with a specific [TermEncoding].
40///
41/// The constructors of types that implement [EncodingScalar] are meant to ensure that the
42/// [ScalarValue] upholds all invariants of the encoding.
43pub trait EncodingScalar {
44    /// The encoding used by this scalar.
45    type Encoding: TermEncoding;
46
47    /// Obtains the encoding instance for this scalar.
48    fn encoding(&self) -> &Self::Encoding;
49
50    /// Returns a reference to the inner scalar value.
51    fn scalar_value(&self) -> &ScalarValue;
52
53    /// Consumes `self` and returns the inner scalar value.
54    fn into_scalar_value(self) -> ScalarValue;
55
56    /// Produces a new array with `number_of_rows`.
57    fn to_array(
58        &self,
59        number_of_rows: usize,
60    ) -> DFResult<<Self::Encoding as TermEncoding>::Array> {
61        let array = self.scalar_value().to_array_of_size(number_of_rows)?;
62        self.encoding().try_new_array(array)
63    }
64}
65
66/// A term encoding defines how RDF terms are represented in Arrow arrays.
67///
68/// Each encoding defines a [DataType] that is uses for encoding RDF terms, while also having a
69/// wrapper [Self::Array] and [Self::Scalar] for Arrow arrays and scalars.
70///
71/// Different term encodings usually have different purposes and may only be valid for certain
72/// operations. For example, the [TypedValueEncoding](crate::typed_value::TypedValueEncoding) cannot
73/// be used to perform arbitrary join operations as it does not retain the lexical value of the RDF
74/// literals. On the other hand, the [TypedValueEncoding](crate::typed_value::TypedValueEncoding)
75/// will outperform the [PlainTermEncoding](crate::plain_term::PlainTermEncoding) for nested
76/// numerical operations as the parsing and validation of numeric literals is only done once.
77/// It is up to the user to ensure the correct use.
78pub trait TermEncoding: Debug + Send + Sync {
79    /// Represents a wrapper for Arrow arrays of this encoding. This can be used in
80    /// conjunction with [TermDecoder] to obtain the values from an Arrow array.
81    type Array: EncodingArray<Encoding = Self>;
82    /// Represents a wrapper for Arrow scalars of this encoding. This can be used in
83    /// conjunction with [TermDecoder] to obtain the values from an Arrow scalar.
84    type Scalar: EncodingScalar<Encoding = Self>;
85
86    /// Returns the name of the encoding.
87    fn name(&self) -> EncodingName;
88
89    /// Returns the [DataType] that is used for this encoding.
90    ///
91    /// This function depends on the instance of an encoding, as some encodings can be configured
92    /// such that the data type changes (at least in the future). Some encodings also expose a
93    /// statically known data type (e.g., [PlainTermEncoding::data_type](crate::plain_term::PlainTermEncoding::data_type)).
94    fn data_type(&self) -> DataType;
95
96    /// Checks whether `array` contains a value with the correct encoding (i.e., type and possibly
97    /// metadata checks). If yes, returns an instance of [Self::Array]. Otherwise, an error is
98    /// returned.
99    fn try_new_array(&self, array: ArrayRef) -> DFResult<Self::Array>;
100
101    /// Checks whether `scalar` contains a value with the correct encoding (i.e., type and possibly
102    /// metadata checks). If yes, returns an instance of [Self::Scalar]. Otherwise, an error is
103    /// returned.
104    fn try_new_scalar(&self, scalar: ScalarValue) -> DFResult<Self::Scalar>;
105
106    /// Checks whether `value` contains a value with the correct encoding (i.e., type and possibly
107    /// metadata checks). If yes, returns a datum that either wraps an array or a scalar. Otherwise,
108    /// an error is returned.
109    fn try_new_datum(
110        &self,
111        value: ColumnarValue,
112        number_rows: usize,
113    ) -> DFResult<EncodingDatum<Self>> {
114        let datum = match value {
115            ColumnarValue::Array(array) => {
116                if array.len() != number_rows {
117                    return exec_err!(
118                        "Unexpected array size. Expected {number_rows}, Actual: {}",
119                        array.len()
120                    );
121                }
122                EncodingDatum::Array(self.try_new_array(array)?)
123            }
124            ColumnarValue::Scalar(scalar) => {
125                EncodingDatum::Scalar(self.try_new_scalar(scalar)?, number_rows)
126            }
127        };
128        Ok(datum)
129    }
130}
131
132/// Allows extracting an iterator of a type from an [EncodingArray].
133///
134/// This allows uesrs to access the inner values of an RDF term array. It allows one to
135/// obtain a typed iterator over the RDF terms in the array. A decoder is specialized for one
136/// encoding and one value type ([Self::Term]).
137///
138/// ### Compatibility
139///
140/// Decoders are allowed to only support a subset of the encoded RDF terms. For example, a decoder
141/// for boolean values may produce an error if it encounters a literal with a different type.
142/// However, it is recommended that there is one decoder per [TermEncoding] that allows users to
143/// extract all RDF terms.
144///
145/// ### Performance
146///
147/// Using a [TermDecoder] for accessing the array, performing an operation on [Self::Term], and then
148/// re-encoding the resulting value using a [TermEncoder] may incur a performance penalty. However,
149/// we hope that this impact can be mitigated by compiler optimizations. We have yet to benchmark
150/// this impact to make a founded recommendation of when to use decoders and encoders. Users are
151/// free to directly work on the Arrow arrays to side-step the typed Encoding/Decoding machinery.
152pub trait TermDecoder<TEncoding: TermEncoding + ?Sized>: Debug + Sync + Send {
153    /// The resulting value type of decoding an RDF term.
154    type Term<'data>;
155
156    /// Allows extracting an iterator over all RDF terms in `array` that are _compatible_ with this
157    /// decoder (see [TermDecoder] for more information).
158    ///
159    /// The creation of the iterator cannot fail by itself, as the invariants of the encodings
160    /// should have been checked while creating `array`. However, the iterator may return an error
161    /// on every new value. This could be due to the value being incompatible with the decoder.
162    fn decode_terms(
163        array: &TEncoding::Array,
164    ) -> impl Iterator<Item = ThinResult<Self::Term<'_>>>;
165
166    /// Allows extracting an iterator over all RDF terms in `array` that are _compatible_ with this
167    /// decoder (see [TermDecoder] for more information).
168    ///
169    /// The creation of the value can fail if the value stored in the `scalar` is incompatible with
170    /// this decoder.
171    fn decode_term(scalar: &TEncoding::Scalar) -> ThinResult<Self::Term<'_>>;
172}
173
174/// Allows encoding an iterator of a type into an [EncodingArray].
175///
176/// This allows users to encode values in an RDF term array. An encoder is specialized for
177/// one encoding and one value type ([Self::Term]). The value type may only represent a subset of
178/// all valid RDF terms (e.g., only Boolean values). However, it is recommended that there is
179/// one decoder per [TermEncoding] that allows users to encode all RDF terms.
180///
181/// ### Performance
182///
183/// Using a [TermDecoder] for accessing the array, performing an operation on [Self::Term], and then
184/// re-encoding the resulting value using a [TermEncoder] may incur a performance penalty. However,
185/// we hope that this impact can be mitigated by compiler optimizations. We have yet to benchmark
186/// this impact to make a founded recommendation of when to use decoders and encoders. Users are
187/// free to directly work on the Arrow arrays to side-step the typed Encoding/Decoding machinery.
188pub trait TermEncoder<TEncoding: TermEncoding + ?Sized>: Debug + Sync + Send {
189    /// The value type that is being encoded.
190    type Term<'data>;
191
192    /// Allows encoding an iterator over RDF terms in an Arrow array.
193    fn encode_terms<'data>(
194        terms: impl IntoIterator<Item = ThinResult<Self::Term<'data>>>,
195    ) -> DFResult<TEncoding::Array>;
196
197    /// Allows encoding a scalar RDF term in an Arrow scalar.
198    fn encode_term(term: ThinResult<Self::Term<'_>>) -> DFResult<TEncoding::Scalar>;
199}
200
201/// Represents either an array or a scalar for a given encoding.
202///
203/// As the scalar variant also stores length information, one can obtain an iterator
204/// ([Self::term_iter]) independently on whether the underlying data is an array or a scalar. This
205/// is useful for scenarios in which distinguishing between array/scalar is not necessary or too
206/// complex.
207pub enum EncodingDatum<TEncoding: TermEncoding + ?Sized> {
208    /// An array underlies this datum.
209    Array(TEncoding::Array),
210    /// A scalar underlies this datum. The additional length value is crucial for creating an
211    /// iterator of a given length.
212    Scalar(TEncoding::Scalar, usize),
213}
214
215impl<TEncoding: TermEncoding + ?Sized> EncodingDatum<TEncoding> {
216    /// Creates an iterator over the contents of this datum.
217    ///
218    /// For an array, the iterator will simply return the result from the decoder.
219    ///
220    /// For a scalar, the value of the scalar will be cloned for each iteration, as dictated by the
221    /// additional length.
222    pub fn term_iter<'data, TDecoder>(
223        &'data self,
224    ) -> Box<dyn Iterator<Item = ThinResult<TDecoder::Term<'data>>> + 'data>
225    where
226        TDecoder: TermDecoder<TEncoding> + 'data,
227    {
228        match self {
229            EncodingDatum::Array(array) => Box::new(
230                TDecoder::decode_terms(array)
231                    .collect::<Vec<_>>()
232                    .into_iter(),
233            ),
234            EncodingDatum::Scalar(scalar, n) => {
235                Box::new((0..*n).map(|_| TDecoder::decode_term(scalar)))
236            }
237        }
238    }
239
240    /// Creates an array for this datum.
241    pub fn to_array(&self) -> TEncoding::Array {
242        match self {
243            EncodingDatum::Array(array) => array.clone(),
244            EncodingDatum::Scalar(scalar, number_rows) => {
245                scalar.to_array(*number_rows).unwrap()
246            }
247        }
248    }
249}