vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use vortex_dtype::DType;
7use vortex_error::{VortexExpect, VortexResult, vortex_bail};
8
9use crate::arrays::{
10    BoolArray, DecimalArray, ExtensionArray, ListArray, NullArray, PrimitiveArray, StructArray,
11    VarBinViewArray, compact_buffers,
12};
13use crate::builders::builder_with_capacity;
14use crate::{Array, ArrayRef, IntoArray};
15
16/// An enum capturing the default uncompressed encodings for each [Vortex type][DType].
17///
18/// Any array can be decoded into canonical form via the [`to_canonical`][Array::to_canonical]
19/// trait method. This is the simplest encoding for a type, and will not be compressed but may
20/// contain compressed child arrays.
21///
22/// Canonical form is useful for doing type-specific compute where you need to know that all
23/// elements are laid out decompressed and contiguous in memory.
24///
25/// # Laziness
26///
27/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
28/// `Struct` type, individual column child arrays may still be compressed. This allows
29/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
30/// never need to be decoded into canonical form at all depending on the compute.
31///
32/// # Arrow interoperability
33///
34/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
35/// zero-copy, and the corresponding Arrow array types can also be built directly.
36///
37/// The full list of canonical types and their equivalent Arrow array types are:
38///
39/// * `NullArray`: [`arrow_array::NullArray`]
40/// * `BoolArray`: [`arrow_array::BooleanArray`]
41/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
42/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
43/// * `StructArray`: [`arrow_array::StructArray`]
44/// * `ListArray`: [`arrow_array::ListArray`]
45/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
46///
47/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
48/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
49/// create ambiguity.
50/// Thus, if you receive an Arrow array, compress it using Vortex, and then
51/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
52/// variants to hold the data.
53///
54/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
55/// will correspond to an arrow-rs [`arrow_schema::DataType`].
56///
57/// # Views support
58///
59/// Binary and String views, also known as "German strings" are a better encoding format for
60/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
61/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
62/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
63/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
64/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
65/// from memory.
66#[derive(Debug, Clone)]
67pub enum Canonical {
68    Null(NullArray),
69    Bool(BoolArray),
70    Primitive(PrimitiveArray),
71    Decimal(DecimalArray),
72    Struct(StructArray),
73    // TODO(joe): maybe this should be a ListView, however this will be annoying in spiral
74    List(ListArray),
75    VarBinView(VarBinViewArray),
76    Extension(ExtensionArray),
77}
78
79impl Canonical {
80    /// Create an empty canonical array of the given dtype.
81    pub fn empty(dtype: &DType) -> Canonical {
82        builder_with_capacity(dtype, 0)
83            .finish()
84            .to_canonical()
85            .vortex_expect("cannot fail to convert an empty array to canonical")
86    }
87}
88
89impl Canonical {
90    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
91    ///
92    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
93    /// they can accumulate wasted space after slicing and taking operations.
94    ///
95    /// This operation is very expensive and can result in things like allocations, full-scans
96    /// and copy operations.
97    pub fn compact(&self) -> VortexResult<Canonical> {
98        match self {
99            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(compact_buffers(array)?)),
100            other => Ok(other.clone()),
101        }
102    }
103}
104
105// Unwrap canonical type back down to specialized type.
106impl Canonical {
107    pub fn into_null(self) -> VortexResult<NullArray> {
108        match self {
109            Canonical::Null(a) => Ok(a),
110            _ => vortex_bail!("Cannot unwrap NullArray from {:?}", &self),
111        }
112    }
113
114    pub fn into_bool(self) -> VortexResult<BoolArray> {
115        match self {
116            Canonical::Bool(a) => Ok(a),
117            _ => vortex_bail!("Cannot unwrap BoolArray from {:?}", &self),
118        }
119    }
120
121    pub fn into_primitive(self) -> VortexResult<PrimitiveArray> {
122        match self {
123            Canonical::Primitive(a) => Ok(a),
124            _ => vortex_bail!("Cannot unwrap PrimitiveArray from {:?}", &self),
125        }
126    }
127
128    pub fn into_decimal(self) -> VortexResult<DecimalArray> {
129        match self {
130            Canonical::Decimal(a) => Ok(a),
131            _ => vortex_bail!("Cannot unwrap DecimalArray from {:?}", &self),
132        }
133    }
134
135    pub fn into_struct(self) -> VortexResult<StructArray> {
136        match self {
137            Canonical::Struct(a) => Ok(a),
138            _ => vortex_bail!("Cannot unwrap StructArray from {:?}", &self),
139        }
140    }
141
142    pub fn into_list(self) -> VortexResult<ListArray> {
143        match self {
144            Canonical::List(a) => Ok(a),
145            _ => vortex_bail!("Cannot unwrap ListArray from {:?}", &self),
146        }
147    }
148
149    pub fn into_varbinview(self) -> VortexResult<VarBinViewArray> {
150        match self {
151            Canonical::VarBinView(a) => Ok(a),
152            _ => vortex_bail!("Cannot unwrap VarBinViewArray from {:?}", &self),
153        }
154    }
155
156    pub fn into_extension(self) -> VortexResult<ExtensionArray> {
157        match self {
158            Canonical::Extension(a) => Ok(a),
159            _ => vortex_bail!("Cannot unwrap ExtensionArray from {:?}", &self),
160        }
161    }
162}
163
164impl AsRef<dyn Array> for Canonical {
165    fn as_ref(&self) -> &(dyn Array + 'static) {
166        match &self {
167            Canonical::Null(a) => a.as_ref(),
168            Canonical::Bool(a) => a.as_ref(),
169            Canonical::Primitive(a) => a.as_ref(),
170            Canonical::Decimal(a) => a.as_ref(),
171            Canonical::Struct(a) => a.as_ref(),
172            Canonical::List(a) => a.as_ref(),
173            Canonical::VarBinView(a) => a.as_ref(),
174            Canonical::Extension(a) => a.as_ref(),
175        }
176    }
177}
178
179impl IntoArray for Canonical {
180    fn into_array(self) -> ArrayRef {
181        match self {
182            Canonical::Null(a) => a.into_array(),
183            Canonical::Bool(a) => a.into_array(),
184            Canonical::Primitive(a) => a.into_array(),
185            Canonical::Decimal(a) => a.into_array(),
186            Canonical::Struct(a) => a.into_array(),
187            Canonical::List(a) => a.into_array(),
188            Canonical::VarBinView(a) => a.into_array(),
189            Canonical::Extension(a) => a.into_array(),
190        }
191    }
192}
193
194/// Trait for types that can be converted from an owned type into an owned array variant.
195///
196/// # Canonicalization
197///
198/// This trait has a blanket implementation for all types implementing [ToCanonical].
199pub trait ToCanonical {
200    /// Canonicalize into a [`NullArray`] if the target is [`Null`][DType::Null] typed.
201    fn to_null(&self) -> VortexResult<NullArray>;
202
203    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`][DType::Bool] typed.
204    fn to_bool(&self) -> VortexResult<BoolArray>;
205
206    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`][DType::Primitive]
207    /// typed.
208    fn to_primitive(&self) -> VortexResult<PrimitiveArray>;
209
210    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`][DType::Decimal]
211    /// typed.
212    fn to_decimal(&self) -> VortexResult<DecimalArray>;
213
214    /// Canonicalize into a [`StructArray`] if the target is [`Struct`][DType::Struct] typed.
215    fn to_struct(&self) -> VortexResult<StructArray>;
216
217    /// Canonicalize into a [`ListArray`] if the target is [`List`][DType::List] typed.
218    fn to_list(&self) -> VortexResult<ListArray>;
219
220    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`][DType::Utf8]
221    /// or [`Binary`][DType::Binary] typed.
222    fn to_varbinview(&self) -> VortexResult<VarBinViewArray>;
223
224    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`][DType::Extension]
225    /// typed.
226    fn to_extension(&self) -> VortexResult<ExtensionArray>;
227}
228
229// Blanket impl for all Array encodings.
230impl<A: Array + ?Sized> ToCanonical for A {
231    fn to_null(&self) -> VortexResult<NullArray> {
232        self.to_canonical()?.into_null()
233    }
234
235    fn to_bool(&self) -> VortexResult<BoolArray> {
236        self.to_canonical()?.into_bool()
237    }
238
239    fn to_primitive(&self) -> VortexResult<PrimitiveArray> {
240        self.to_canonical()?.into_primitive()
241    }
242
243    fn to_decimal(&self) -> VortexResult<DecimalArray> {
244        self.to_canonical()?.into_decimal()
245    }
246
247    fn to_struct(&self) -> VortexResult<StructArray> {
248        self.to_canonical()?.into_struct()
249    }
250
251    fn to_list(&self) -> VortexResult<ListArray> {
252        self.to_canonical()?.into_list()
253    }
254
255    fn to_varbinview(&self) -> VortexResult<VarBinViewArray> {
256        self.to_canonical()?.into_varbinview()
257    }
258
259    fn to_extension(&self) -> VortexResult<ExtensionArray> {
260        self.to_canonical()?.into_extension()
261    }
262}
263
264impl From<Canonical> for ArrayRef {
265    fn from(value: Canonical) -> Self {
266        match value {
267            Canonical::Null(a) => a.into_array(),
268            Canonical::Bool(a) => a.into_array(),
269            Canonical::Primitive(a) => a.into_array(),
270            Canonical::Decimal(a) => a.into_array(),
271            Canonical::Struct(a) => a.into_array(),
272            Canonical::List(a) => a.into_array(),
273            Canonical::VarBinView(a) => a.into_array(),
274            Canonical::Extension(a) => a.into_array(),
275        }
276    }
277}
278
279#[cfg(test)]
280mod test {
281    use std::sync::Arc;
282
283    use arrow_array::cast::AsArray;
284    use arrow_array::types::{Int32Type, Int64Type, UInt64Type};
285    use arrow_array::{
286        Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
287        PrimitiveArray as ArrowPrimitiveArray, StringArray, StringViewArray,
288        StructArray as ArrowStructArray,
289    };
290    use arrow_buffer::{NullBufferBuilder, OffsetBuffer};
291    use arrow_schema::{DataType, Field};
292    use vortex_buffer::buffer;
293
294    use crate::arrays::{ConstantArray, StructArray};
295    use crate::arrow::{FromArrowArray, IntoArrowArray};
296    use crate::{ArrayRef, IntoArray};
297
298    #[test]
299    fn test_canonicalize_nested_struct() {
300        // Create a struct array with multiple internal components.
301        let nested_struct_array = StructArray::from_fields(&[
302            ("a", buffer![1u64].into_array()),
303            (
304                "b",
305                StructArray::from_fields(&[(
306                    "inner_a",
307                    // The nested struct contains a ConstantArray representing the primitive array
308                    //   [100i64]
309                    // ConstantArray is not a canonical type, so converting `into_arrow()` should
310                    // map this to the nearest canonical type (PrimitiveArray).
311                    ConstantArray::new(100i64, 1).into_array(),
312                )])
313                .unwrap()
314                .into_array(),
315            ),
316        ])
317        .unwrap();
318
319        let arrow_struct = nested_struct_array
320            .into_array()
321            .into_arrow_preferred()
322            .unwrap()
323            .as_any()
324            .downcast_ref::<ArrowStructArray>()
325            .cloned()
326            .unwrap();
327
328        assert!(
329            arrow_struct
330                .column(0)
331                .as_any()
332                .downcast_ref::<ArrowPrimitiveArray<UInt64Type>>()
333                .is_some()
334        );
335
336        let inner_struct = arrow_struct
337            .column(1)
338            .clone()
339            .as_any()
340            .downcast_ref::<ArrowStructArray>()
341            .cloned()
342            .unwrap();
343
344        let inner_a = inner_struct
345            .column(0)
346            .as_any()
347            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
348        assert!(inner_a.is_some());
349
350        assert_eq!(
351            inner_a.cloned().unwrap(),
352            ArrowPrimitiveArray::from_iter([100i64]),
353        );
354    }
355
356    #[test]
357    fn roundtrip_struct() {
358        let mut nulls = NullBufferBuilder::new(6);
359        nulls.append_n_non_nulls(4);
360        nulls.append_null();
361        nulls.append_non_null();
362        let names = Arc::new(StringViewArray::from_iter(vec![
363            Some("Joseph"),
364            None,
365            Some("Angela"),
366            Some("Mikhail"),
367            None,
368            None,
369        ]));
370        let ages = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
371            Some(25),
372            Some(31),
373            None,
374            Some(57),
375            None,
376            None,
377        ]));
378
379        let arrow_struct = ArrowStructArray::new(
380            vec![
381                Arc::new(Field::new("name", DataType::Utf8View, true)),
382                Arc::new(Field::new("age", DataType::Int32, true)),
383            ]
384            .into(),
385            vec![names, ages],
386            nulls.finish(),
387        );
388
389        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true);
390
391        assert_eq!(
392            &arrow_struct,
393            vortex_struct.into_arrow_preferred().unwrap().as_struct()
394        );
395    }
396
397    #[test]
398    fn roundtrip_list() {
399        let names = Arc::new(StringArray::from_iter(vec![
400            Some("Joseph"),
401            Some("Angela"),
402            Some("Mikhail"),
403        ]));
404
405        let arrow_list = ArrowListArray::new(
406            Arc::new(Field::new_list_field(DataType::Utf8, true)),
407            OffsetBuffer::from_lengths(vec![0, 2, 1]),
408            names,
409            None,
410        );
411        let list_data_type = arrow_list.data_type();
412
413        let vortex_list = ArrayRef::from_arrow(&arrow_list, true);
414
415        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();
416
417        assert_eq!(
418            (Arc::new(arrow_list.clone()) as ArrowArrayRef).as_ref(),
419            rt_arrow_list.as_ref()
420        );
421    }
422}