vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use vortex_dtype::DType;
7use vortex_error::{VortexResult, vortex_panic};
8
9use crate::arrays::{
10    BoolArray, DecimalArray, ExtensionArray, FixedSizeListArray, ListViewArray,
11    ListViewRebuildMode, NullArray, PrimitiveArray, StructArray, VarBinViewArray,
12};
13use crate::builders::builder_with_capacity;
14use crate::{Array, ArrayRef, IntoArray};
15
16/// An enum capturing the default uncompressed encodings for each [Vortex type](DType).
17///
18/// Any array can be decoded into canonical form via the [`to_canonical`](Array::to_canonical)
19/// trait method. This is the simplest encoding for a type, and will not be compressed but may
20/// contain compressed child arrays.
21///
22/// Canonical form is useful for doing type-specific compute where you need to know that all
23/// elements are laid out decompressed and contiguous in memory.
24///
25/// Each `Canonical` variant has a corresponding [`DType`] variant, with the notable exception of
26/// [`Canonical::VarBinView`], which is the canonical encoding for both [`DType::Utf8`] and
27/// [`DType::Binary`].
28///
29/// # Laziness
30///
31/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
32/// `Struct` type, individual column child arrays may still be compressed. This allows
33/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
34/// never need to be decoded into canonical form at all depending on the compute.
35///
36/// # Arrow interoperability
37///
38/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
39/// zero-copy, and the corresponding Arrow array types can also be built directly.
40///
41/// The full list of canonical types and their equivalent Arrow array types are:
42///
43/// * `NullArray`: [`arrow_array::NullArray`]
44/// * `BoolArray`: [`arrow_array::BooleanArray`]
45/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
46/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
47/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
48/// * `ListViewArray`: [`arrow_array::ListViewArray`]
49/// * `FixedSizeListArray`: [`arrow_array::FixedSizeListArray`]
50/// * `StructArray`: [`arrow_array::StructArray`]
51///
52/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
53/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
54/// create ambiguity.
55/// Thus, if you receive an Arrow array, compress it using Vortex, and then
56/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
57/// variants to hold the data.
58///
59/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
60/// will correspond to an arrow-rs [`arrow_schema::DataType`].
61///
62/// # Views support
63///
/// Binary and String views, also known as "German strings", are a better encoding format for
/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
/// fully supported by the DataFusion query engine. We use them as our canonical string encoding
/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
/// from memory.
71///
72/// # For Developers
73///
74/// If you add another variant to this enum, make sure to update [`Array::is_canonical`],
75/// and the fuzzer in `fuzz/fuzz_targets/array_ops.rs`.
#[derive(Debug, Clone)]
pub enum Canonical {
    /// Canonical form of [`DType::Null`] arrays.
    Null(NullArray),
    /// Canonical form of [`DType::Bool`] arrays.
    Bool(BoolArray),
    /// Canonical form of [`DType::Primitive`] arrays.
    Primitive(PrimitiveArray),
    /// Canonical form of [`DType::Decimal`] arrays.
    Decimal(DecimalArray),
    /// Canonical form of both [`DType::Utf8`] and [`DType::Binary`] arrays.
    VarBinView(VarBinViewArray),
    /// Canonical form of [`DType::List`] arrays. Note that the payload is a [`ListViewArray`].
    List(ListViewArray),
    /// Canonical form of [`DType::FixedSizeList`] arrays.
    FixedSizeList(FixedSizeListArray),
    /// Canonical form of [`DType::Struct`] arrays.
    Struct(StructArray),
    /// Canonical form of [`DType::Extension`] arrays.
    Extension(ExtensionArray),
}
88
89impl Canonical {
90    // TODO(connor): This can probably be specialized for each of the canonical arrays.
91    /// Create an empty canonical array of the given dtype.
92    pub fn empty(dtype: &DType) -> Canonical {
93        builder_with_capacity(dtype, 0).finish_into_canonical()
94    }
95}
96
97impl Canonical {
98    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
99    ///
100    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
101    /// they can accumulate wasted space after slicing and taking operations.
102    ///
103    /// This operation is very expensive and can result in things like allocations, full-scans
104    /// and copy operations.
105    pub fn compact(&self) -> VortexResult<Canonical> {
106        match self {
107            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
108            Canonical::List(array) => Ok(Canonical::List(
109                array.rebuild(ListViewRebuildMode::MakeZeroCopyToList),
110            )),
111            _ => Ok(self.clone()),
112        }
113    }
114}
115
116// Unwrap canonical type back down to specialized type.
117impl Canonical {
118    pub fn as_null(&self) -> &NullArray {
119        if let Canonical::Null(a) = self {
120            a
121        } else {
122            vortex_panic!("Cannot get NullArray from {:?}", &self)
123        }
124    }
125
126    pub fn into_null(self) -> NullArray {
127        if let Canonical::Null(a) = self {
128            a
129        } else {
130            vortex_panic!("Cannot unwrap NullArray from {:?}", &self)
131        }
132    }
133
134    pub fn as_bool(&self) -> &BoolArray {
135        if let Canonical::Bool(a) = self {
136            a
137        } else {
138            vortex_panic!("Cannot get BoolArray from {:?}", &self)
139        }
140    }
141
142    pub fn into_bool(self) -> BoolArray {
143        if let Canonical::Bool(a) = self {
144            a
145        } else {
146            vortex_panic!("Cannot unwrap BoolArray from {:?}", &self)
147        }
148    }
149
150    pub fn as_primitive(&self) -> &PrimitiveArray {
151        if let Canonical::Primitive(a) = self {
152            a
153        } else {
154            vortex_panic!("Cannot get PrimitiveArray from {:?}", &self)
155        }
156    }
157
158    pub fn into_primitive(self) -> PrimitiveArray {
159        if let Canonical::Primitive(a) = self {
160            a
161        } else {
162            vortex_panic!("Cannot unwrap PrimitiveArray from {:?}", &self)
163        }
164    }
165
166    pub fn as_decimal(&self) -> &DecimalArray {
167        if let Canonical::Decimal(a) = self {
168            a
169        } else {
170            vortex_panic!("Cannot get DecimalArray from {:?}", &self)
171        }
172    }
173
174    pub fn into_decimal(self) -> DecimalArray {
175        if let Canonical::Decimal(a) = self {
176            a
177        } else {
178            vortex_panic!("Cannot unwrap DecimalArray from {:?}", &self)
179        }
180    }
181
182    pub fn as_varbinview(&self) -> &VarBinViewArray {
183        if let Canonical::VarBinView(a) = self {
184            a
185        } else {
186            vortex_panic!("Cannot get VarBinViewArray from {:?}", &self)
187        }
188    }
189
190    pub fn into_varbinview(self) -> VarBinViewArray {
191        if let Canonical::VarBinView(a) = self {
192            a
193        } else {
194            vortex_panic!("Cannot unwrap VarBinViewArray from {:?}", &self)
195        }
196    }
197
198    pub fn as_listview(&self) -> &ListViewArray {
199        if let Canonical::List(a) = self {
200            a
201        } else {
202            vortex_panic!("Cannot get ListArray from {:?}", &self)
203        }
204    }
205
206    pub fn into_listview(self) -> ListViewArray {
207        if let Canonical::List(a) = self {
208            a
209        } else {
210            vortex_panic!("Cannot unwrap ListArray from {:?}", &self)
211        }
212    }
213
214    pub fn as_fixed_size_list(&self) -> &FixedSizeListArray {
215        if let Canonical::FixedSizeList(a) = self {
216            a
217        } else {
218            vortex_panic!("Cannot get FixedSizeListArray from {:?}", &self)
219        }
220    }
221
222    pub fn into_fixed_size_list(self) -> FixedSizeListArray {
223        if let Canonical::FixedSizeList(a) = self {
224            a
225        } else {
226            vortex_panic!("Cannot unwrap FixedSizeListArray from {:?}", &self)
227        }
228    }
229
230    pub fn as_struct(&self) -> &StructArray {
231        if let Canonical::Struct(a) = self {
232            a
233        } else {
234            vortex_panic!("Cannot get StructArray from {:?}", &self)
235        }
236    }
237
238    pub fn into_struct(self) -> StructArray {
239        if let Canonical::Struct(a) = self {
240            a
241        } else {
242            vortex_panic!("Cannot unwrap StructArray from {:?}", &self)
243        }
244    }
245
246    pub fn as_extension(&self) -> &ExtensionArray {
247        if let Canonical::Extension(a) = self {
248            a
249        } else {
250            vortex_panic!("Cannot get ExtensionArray from {:?}", &self)
251        }
252    }
253
254    pub fn into_extension(self) -> ExtensionArray {
255        if let Canonical::Extension(a) = self {
256            a
257        } else {
258            vortex_panic!("Cannot unwrap ExtensionArray from {:?}", &self)
259        }
260    }
261}
262
263impl AsRef<dyn Array> for Canonical {
264    fn as_ref(&self) -> &(dyn Array + 'static) {
265        match &self {
266            Canonical::Null(a) => a.as_ref(),
267            Canonical::Bool(a) => a.as_ref(),
268            Canonical::Primitive(a) => a.as_ref(),
269            Canonical::Decimal(a) => a.as_ref(),
270            Canonical::Struct(a) => a.as_ref(),
271            Canonical::List(a) => a.as_ref(),
272            Canonical::FixedSizeList(a) => a.as_ref(),
273            Canonical::VarBinView(a) => a.as_ref(),
274            Canonical::Extension(a) => a.as_ref(),
275        }
276    }
277}
278
279impl IntoArray for Canonical {
280    fn into_array(self) -> ArrayRef {
281        match self {
282            Canonical::Null(a) => a.into_array(),
283            Canonical::Bool(a) => a.into_array(),
284            Canonical::Primitive(a) => a.into_array(),
285            Canonical::Decimal(a) => a.into_array(),
286            Canonical::Struct(a) => a.into_array(),
287            Canonical::List(a) => a.into_array(),
288            Canonical::FixedSizeList(a) => a.into_array(),
289            Canonical::VarBinView(a) => a.into_array(),
290            Canonical::Extension(a) => a.into_array(),
291        }
292    }
293}
294
/// Trait for canonicalizing a borrowed array directly into a specific owned canonical array type.
///
/// Each method is a shorthand for decoding into [`Canonical`] and then unwrapping the variant
/// that matches the method's name.
///
/// # Canonicalization
///
/// This trait has a blanket implementation for all types implementing [`Array`].
///
/// # Panics
///
/// The blanket implementation panics if the canonical form of the array is not the variant that
/// the called method asks for, e.g. calling [`to_bool`](ToCanonical::to_bool) on an array whose
/// canonical form is a [`PrimitiveArray`].
pub trait ToCanonical {
    /// Canonicalize into a [`NullArray`] if the target is [`Null`](DType::Null) typed.
    fn to_null(&self) -> NullArray;

    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`](DType::Bool) typed.
    fn to_bool(&self) -> BoolArray;

    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`](DType::Primitive)
    /// typed.
    fn to_primitive(&self) -> PrimitiveArray;

    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`](DType::Decimal)
    /// typed.
    fn to_decimal(&self) -> DecimalArray;

    /// Canonicalize into a [`StructArray`] if the target is [`Struct`](DType::Struct) typed.
    fn to_struct(&self) -> StructArray;

    /// Canonicalize into a [`ListViewArray`] if the target is [`List`](DType::List) typed.
    fn to_listview(&self) -> ListViewArray;

    /// Canonicalize into a [`FixedSizeListArray`] if the target is
    /// [`FixedSizeList`](DType::FixedSizeList) typed.
    fn to_fixed_size_list(&self) -> FixedSizeListArray;

    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`](DType::Utf8)
    /// or [`Binary`](DType::Binary) typed.
    fn to_varbinview(&self) -> VarBinViewArray;

    /// Canonicalize into an [`ExtensionArray`] if the target is [`Extension`](DType::Extension)
    /// typed.
    fn to_extension(&self) -> ExtensionArray;
}
333
334// Blanket impl for all Array encodings.
335impl<A: Array + ?Sized> ToCanonical for A {
336    fn to_null(&self) -> NullArray {
337        self.to_canonical().into_null()
338    }
339
340    fn to_bool(&self) -> BoolArray {
341        self.to_canonical().into_bool()
342    }
343
344    fn to_primitive(&self) -> PrimitiveArray {
345        self.to_canonical().into_primitive()
346    }
347
348    fn to_decimal(&self) -> DecimalArray {
349        self.to_canonical().into_decimal()
350    }
351
352    fn to_struct(&self) -> StructArray {
353        self.to_canonical().into_struct()
354    }
355
356    fn to_listview(&self) -> ListViewArray {
357        self.to_canonical().into_listview()
358    }
359
360    fn to_fixed_size_list(&self) -> FixedSizeListArray {
361        self.to_canonical().into_fixed_size_list()
362    }
363
364    fn to_varbinview(&self) -> VarBinViewArray {
365        self.to_canonical().into_varbinview()
366    }
367
368    fn to_extension(&self) -> ExtensionArray {
369        self.to_canonical().into_extension()
370    }
371}
372
373impl From<Canonical> for ArrayRef {
374    fn from(value: Canonical) -> Self {
375        match value {
376            Canonical::Null(a) => a.into_array(),
377            Canonical::Bool(a) => a.into_array(),
378            Canonical::Primitive(a) => a.into_array(),
379            Canonical::Decimal(a) => a.into_array(),
380            Canonical::Struct(a) => a.into_array(),
381            Canonical::List(a) => a.into_array(),
382            Canonical::FixedSizeList(a) => a.into_array(),
383            Canonical::VarBinView(a) => a.into_array(),
384            Canonical::Extension(a) => a.into_array(),
385        }
386    }
387}
388
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use arrow_array::cast::AsArray;
    use arrow_array::types::{Int32Type, Int64Type, UInt64Type};
    use arrow_array::{
        Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
        PrimitiveArray as ArrowPrimitiveArray, StringArray, StringViewArray,
        StructArray as ArrowStructArray,
    };
    use arrow_buffer::{NullBufferBuilder, OffsetBuffer};
    use arrow_schema::{DataType, Field};
    use vortex_buffer::buffer;

    use crate::arrays::{ConstantArray, StructArray};
    use crate::arrow::{FromArrowArray, IntoArrowArray};
    use crate::{ArrayRef, IntoArray};

    /// Converting a struct to Arrow must also canonicalize non-canonical arrays nested inside it.
    #[test]
    fn test_canonicalize_nested_struct() {
        // The inner struct holds a ConstantArray standing in for the primitive array [100i64].
        // ConstantArray is not a canonical type, so the Arrow conversion is expected to map it
        // to the nearest canonical type (PrimitiveArray).
        let inner = StructArray::from_fields(&[(
            "inner_a",
            ConstantArray::new(100i64, 1).into_array(),
        )])
        .unwrap()
        .into_array();

        // The outer struct pairs a plain primitive column with the nested struct.
        let outer =
            StructArray::from_fields(&[("a", buffer![1u64].into_array()), ("b", inner)]).unwrap();

        let arrow_array = outer.into_array().into_arrow_preferred().unwrap();
        let arrow_struct = arrow_array
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .cloned()
            .unwrap();

        // Column "a" must come out as an Arrow u64 primitive array.
        assert!(
            arrow_struct
                .column(0)
                .as_any()
                .is::<ArrowPrimitiveArray<UInt64Type>>()
        );

        // Column "b" must still be a struct ...
        let inner_struct = arrow_struct
            .column(1)
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .cloned()
            .unwrap();

        // ... whose only column has been decoded into an Arrow i64 primitive array.
        let inner_a = inner_struct
            .column(0)
            .as_any()
            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
        assert!(inner_a.is_some());

        let expected = ArrowPrimitiveArray::<Int64Type>::from_iter([100i64]);
        assert_eq!(inner_a.cloned().unwrap(), expected);
    }

    /// A nullable Arrow struct with nullable columns survives Arrow -> Vortex -> Arrow unchanged.
    #[test]
    fn roundtrip_struct() {
        // Struct-level validity: four valid rows, one null row, one valid row.
        let mut validity = NullBufferBuilder::new(6);
        validity.append_n_non_nulls(4);
        validity.append_null();
        validity.append_non_null();

        let names: ArrowArrayRef = Arc::new(StringViewArray::from_iter(vec![
            Some("Joseph"),
            None,
            Some("Angela"),
            Some("Mikhail"),
            None,
            None,
        ]));
        let ages: ArrowArrayRef = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
            Some(25),
            Some(31),
            None,
            Some(57),
            None,
            None,
        ]));

        let fields = vec![
            Arc::new(Field::new("name", DataType::Utf8View, true)),
            Arc::new(Field::new("age", DataType::Int32, true)),
        ];
        let arrow_struct = ArrowStructArray::new(fields.into(), vec![names, ages], validity.finish());

        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true);
        let roundtripped = vortex_struct.into_arrow_preferred().unwrap();

        assert_eq!(&arrow_struct, roundtripped.as_struct());
    }

    /// An Arrow list of strings survives Arrow -> Vortex -> Arrow when its own data type is
    /// requested on the way back.
    #[test]
    fn roundtrip_list() {
        let names = Arc::new(StringArray::from_iter(vec![
            Some("Joseph"),
            Some("Angela"),
            Some("Mikhail"),
        ]));

        // Three lists with lengths 0, 2 and 1 over the three names.
        let item_field = Arc::new(Field::new_list_field(DataType::Utf8, true));
        let offsets = OffsetBuffer::<i32>::from_lengths(vec![0, 2, 1]);
        let arrow_list = ArrowListArray::new(item_field, offsets, names, None);

        let vortex_list = ArrayRef::from_arrow(&arrow_list, true);
        let rt_arrow_list = vortex_list.into_arrow(arrow_list.data_type()).unwrap();

        let expected: ArrowArrayRef = Arc::new(arrow_list.clone());
        assert_eq!(expected.as_ref(), rt_arrow_list.as_ref());
    }
}