rdf_fusion_encoding/sortable_term/
encoding.rs

1use crate::encoding::TermEncoding;
2use crate::sortable_term::encoders::TermRefSortableTermEncoder;
3use crate::sortable_term::{SortableTermArray, SortableTermScalar};
4use crate::{EncodingArray, EncodingName, TermEncoder};
5use datafusion::arrow::array::ArrayRef;
6use datafusion::arrow::datatypes::{DataType, Field, Fields};
7use datafusion::common::ScalarValue;
8use rdf_fusion_model::DFResult;
9use rdf_fusion_model::{TermRef, ThinResult};
10use std::clone::Clone;
11use std::sync::LazyLock;
12
/// Identifies one field of the struct that backs the sortable term encoding.
///
/// The encoding exists as a work-around: DataFusion does not yet support user-defined
/// orderings, so a column of this struct type is projected and DataFusion's built-in struct
/// ordering is used to obtain the SPARQL order.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum SortableTermEncodingField {
    /// The type of the encoded term. It is the first column of the struct, so the ordering is
    /// first separated by term type (e.g., blank nodes coming before named nodes).
    Type,
    /// A Float64 representation of a possible numeric value. Some values (e.g., Decimals)
    /// cannot be represented accurately this way; as the whole encoding is hoped to be a
    /// temporary solution, this is considered "good enough" for now.
    Numeric,
    /// Bytes that are compared based on their byte values.
    Bytes,
}
30
31impl SortableTermEncodingField {
32    /// Get the name of the field.
33    pub fn name(self) -> &'static str {
34        match self {
35            SortableTermEncodingField::Type => "type",
36            SortableTermEncodingField::Numeric => "numeric",
37            SortableTermEncodingField::Bytes => "bytes",
38        }
39    }
40
41    /// Get the index in the struct from that field.
42    pub fn index(self) -> usize {
43        match self {
44            SortableTermEncodingField::Type => 0,
45            SortableTermEncodingField::Numeric => 1,
46            SortableTermEncodingField::Bytes => 2,
47        }
48    }
49
50    /// Get the [DataType] of this field.
51    pub fn data_type(self) -> DataType {
52        match self {
53            SortableTermEncodingField::Type => DataType::UInt8,
54            SortableTermEncodingField::Numeric => DataType::Float64,
55            SortableTermEncodingField::Bytes => DataType::Binary,
56        }
57    }
58}
59
60static FIELDS: LazyLock<Fields> = LazyLock::new(|| {
61    Fields::from(vec![
62        Field::new(
63            SortableTermEncodingField::Type.name(),
64            SortableTermEncodingField::Type.data_type(),
65            false,
66        ),
67        Field::new(
68            SortableTermEncodingField::Numeric.name(),
69            SortableTermEncodingField::Numeric.data_type(),
70            true,
71        ),
72        Field::new(
73            SortableTermEncodingField::Bytes.name(),
74            SortableTermEncodingField::Bytes.data_type(),
75            false,
76        ),
77    ])
78});
79
80/// The instance of the [SortableTermEncoding].
81///
82/// As there is currently no way to parameterize the encoding, accessing it via this constant is
83/// the preferred way.
84pub const SORTABLE_TERM_ENCODING: SortableTermEncoding = SortableTermEncoding;
85
/// An encoding that expresses the expected SPARQL ordering through DataFusion's built-in
/// ordering for structs.
///
/// It is a work-around until a custom ordering can be defined in DataFusion. An alternative
/// would be a custom operator for sorting SPARQL solutions.
#[derive(Debug)]
pub struct SortableTermEncoding;
93
94impl SortableTermEncoding {
95    /// Returns the fields of this encoding.
96    pub fn fields() -> Fields {
97        FIELDS.clone()
98    }
99
100    /// Encodes the `term` as a [SortableTermScalar].
101    pub fn encode_term(
102        &self,
103        term: ThinResult<TermRef<'_>>,
104    ) -> DFResult<SortableTermScalar> {
105        TermRefSortableTermEncoder::encode_terms([term])?.try_as_scalar(0)
106    }
107}
108
109impl TermEncoding for SortableTermEncoding {
110    type Array = SortableTermArray;
111    type Scalar = SortableTermScalar;
112
113    fn name(&self) -> EncodingName {
114        EncodingName::Sortable
115    }
116
117    fn data_type(&self) -> DataType {
118        DataType::Struct(Self::fields().clone())
119    }
120
121    fn try_new_array(&self, array: ArrayRef) -> DFResult<Self::Array> {
122        array.try_into()
123    }
124
125    fn try_new_scalar(&self, scalar: ScalarValue) -> DFResult<Self::Scalar> {
126        scalar.try_into()
127    }
128}