vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use vortex_dtype::DType;
7use vortex_error::{VortexResult, vortex_panic};
8
9use crate::arrays::{
10    BoolArray, DecimalArray, ExtensionArray, FixedSizeListArray, ListViewArray,
11    ListViewRebuildMode, NullArray, PrimitiveArray, StructArray, VarBinViewArray,
12};
13use crate::builders::builder_with_capacity;
14use crate::{Array, ArrayRef, IntoArray};
15
16/// An enum capturing the default uncompressed encodings for each [Vortex type](DType).
17///
18/// Any array can be decoded into canonical form via the [`to_canonical`](Array::to_canonical)
19/// trait method. This is the simplest encoding for a type, and will not be compressed but may
20/// contain compressed child arrays.
21///
22/// Canonical form is useful for doing type-specific compute where you need to know that all
23/// elements are laid out decompressed and contiguous in memory.
24///
25/// Each `Canonical` variant has a corresponding [`DType`] variant, with the notable exception of
26/// [`Canonical::VarBinView`], which is the canonical encoding for both [`DType::Utf8`] and
27/// [`DType::Binary`].
28///
29/// # Laziness
30///
31/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
32/// `Struct` type, individual column child arrays may still be compressed. This allows
33/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
34/// never need to be decoded into canonical form at all depending on the compute.
35///
36/// # Arrow interoperability
37///
38/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
39/// zero-copy, and the corresponding Arrow array types can also be built directly.
40///
41/// The full list of canonical types and their equivalent Arrow array types are:
42///
43/// * `NullArray`: [`arrow_array::NullArray`]
44/// * `BoolArray`: [`arrow_array::BooleanArray`]
45/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
46/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
47/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
48/// * `ListViewArray`: [`arrow_array::ListViewArray`]
49/// * `FixedSizeListArray`: [`arrow_array::FixedSizeListArray`]
50/// * `StructArray`: [`arrow_array::StructArray`]
51///
52/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
53/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
54/// create ambiguity.
55/// Thus, if you receive an Arrow array, compress it using Vortex, and then
56/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
57/// variants to hold the data.
58///
59/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
60/// will correspond to an arrow-rs [`arrow_schema::DataType`].
61///
62/// # Views support
63///
64/// Binary and String views, also known as "German strings" are a better encoding format for
65/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
66/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
67/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
68/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
69/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
70/// from memory.
71///
72/// # For Developers
73///
74/// If you add another variant to this enum, make sure to update [`Array::is_canonical`],
75/// [`ArrayRegistry::canonical_only`], and the fuzzer in `fuzz/fuzz_targets/array_ops.rs`.
76///
77/// [`ArrayRegistry::canonical_only`]: crate::ArrayRegistry::canonical_only
78#[derive(Debug, Clone)]
79pub enum Canonical {
80    Null(NullArray),
81    Bool(BoolArray),
82    Primitive(PrimitiveArray),
83    Decimal(DecimalArray),
84    VarBinView(VarBinViewArray),
85    List(ListViewArray),
86    FixedSizeList(FixedSizeListArray),
87    Struct(StructArray),
88    Extension(ExtensionArray),
89}
90
91impl Canonical {
92    // TODO(connor): This can probably be specialized for each of the canonical arrays.
93    /// Create an empty canonical array of the given dtype.
94    pub fn empty(dtype: &DType) -> Canonical {
95        builder_with_capacity(dtype, 0).finish_into_canonical()
96    }
97}
98
99impl Canonical {
100    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
101    ///
102    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
103    /// they can accumulate wasted space after slicing and taking operations.
104    ///
105    /// This operation is very expensive and can result in things like allocations, full-scans
106    /// and copy operations.
107    pub fn compact(&self) -> VortexResult<Canonical> {
108        match self {
109            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
110            Canonical::List(array) => Ok(Canonical::List(
111                array.rebuild(ListViewRebuildMode::MakeZeroCopyToList),
112            )),
113            _ => Ok(self.clone()),
114        }
115    }
116}
117
118// Unwrap canonical type back down to specialized type.
119impl Canonical {
120    pub fn as_null(&self) -> &NullArray {
121        if let Canonical::Null(a) = self {
122            a
123        } else {
124            vortex_panic!("Cannot get NullArray from {:?}", &self)
125        }
126    }
127
128    pub fn into_null(self) -> NullArray {
129        if let Canonical::Null(a) = self {
130            a
131        } else {
132            vortex_panic!("Cannot unwrap NullArray from {:?}", &self)
133        }
134    }
135
136    pub fn as_bool(&self) -> &BoolArray {
137        if let Canonical::Bool(a) = self {
138            a
139        } else {
140            vortex_panic!("Cannot get BoolArray from {:?}", &self)
141        }
142    }
143
144    pub fn into_bool(self) -> BoolArray {
145        if let Canonical::Bool(a) = self {
146            a
147        } else {
148            vortex_panic!("Cannot unwrap BoolArray from {:?}", &self)
149        }
150    }
151
152    pub fn as_primitive(&self) -> &PrimitiveArray {
153        if let Canonical::Primitive(a) = self {
154            a
155        } else {
156            vortex_panic!("Cannot get PrimitiveArray from {:?}", &self)
157        }
158    }
159
160    pub fn into_primitive(self) -> PrimitiveArray {
161        if let Canonical::Primitive(a) = self {
162            a
163        } else {
164            vortex_panic!("Cannot unwrap PrimitiveArray from {:?}", &self)
165        }
166    }
167
168    pub fn as_decimal(&self) -> &DecimalArray {
169        if let Canonical::Decimal(a) = self {
170            a
171        } else {
172            vortex_panic!("Cannot get DecimalArray from {:?}", &self)
173        }
174    }
175
176    pub fn into_decimal(self) -> DecimalArray {
177        if let Canonical::Decimal(a) = self {
178            a
179        } else {
180            vortex_panic!("Cannot unwrap DecimalArray from {:?}", &self)
181        }
182    }
183
184    pub fn as_varbinview(&self) -> &VarBinViewArray {
185        if let Canonical::VarBinView(a) = self {
186            a
187        } else {
188            vortex_panic!("Cannot get VarBinViewArray from {:?}", &self)
189        }
190    }
191
192    pub fn into_varbinview(self) -> VarBinViewArray {
193        if let Canonical::VarBinView(a) = self {
194            a
195        } else {
196            vortex_panic!("Cannot unwrap VarBinViewArray from {:?}", &self)
197        }
198    }
199
200    pub fn as_listview(&self) -> &ListViewArray {
201        if let Canonical::List(a) = self {
202            a
203        } else {
204            vortex_panic!("Cannot get ListArray from {:?}", &self)
205        }
206    }
207
208    pub fn into_listview(self) -> ListViewArray {
209        if let Canonical::List(a) = self {
210            a
211        } else {
212            vortex_panic!("Cannot unwrap ListArray from {:?}", &self)
213        }
214    }
215
216    pub fn as_fixed_size_list(&self) -> &FixedSizeListArray {
217        if let Canonical::FixedSizeList(a) = self {
218            a
219        } else {
220            vortex_panic!("Cannot get FixedSizeListArray from {:?}", &self)
221        }
222    }
223
224    pub fn into_fixed_size_list(self) -> FixedSizeListArray {
225        if let Canonical::FixedSizeList(a) = self {
226            a
227        } else {
228            vortex_panic!("Cannot unwrap FixedSizeListArray from {:?}", &self)
229        }
230    }
231
232    pub fn as_struct(&self) -> &StructArray {
233        if let Canonical::Struct(a) = self {
234            a
235        } else {
236            vortex_panic!("Cannot get StructArray from {:?}", &self)
237        }
238    }
239
240    pub fn into_struct(self) -> StructArray {
241        if let Canonical::Struct(a) = self {
242            a
243        } else {
244            vortex_panic!("Cannot unwrap StructArray from {:?}", &self)
245        }
246    }
247
248    pub fn as_extension(&self) -> &ExtensionArray {
249        if let Canonical::Extension(a) = self {
250            a
251        } else {
252            vortex_panic!("Cannot get ExtensionArray from {:?}", &self)
253        }
254    }
255
256    pub fn into_extension(self) -> ExtensionArray {
257        if let Canonical::Extension(a) = self {
258            a
259        } else {
260            vortex_panic!("Cannot unwrap ExtensionArray from {:?}", &self)
261        }
262    }
263}
264
265impl AsRef<dyn Array> for Canonical {
266    fn as_ref(&self) -> &(dyn Array + 'static) {
267        match &self {
268            Canonical::Null(a) => a.as_ref(),
269            Canonical::Bool(a) => a.as_ref(),
270            Canonical::Primitive(a) => a.as_ref(),
271            Canonical::Decimal(a) => a.as_ref(),
272            Canonical::Struct(a) => a.as_ref(),
273            Canonical::List(a) => a.as_ref(),
274            Canonical::FixedSizeList(a) => a.as_ref(),
275            Canonical::VarBinView(a) => a.as_ref(),
276            Canonical::Extension(a) => a.as_ref(),
277        }
278    }
279}
280
281impl IntoArray for Canonical {
282    fn into_array(self) -> ArrayRef {
283        match self {
284            Canonical::Null(a) => a.into_array(),
285            Canonical::Bool(a) => a.into_array(),
286            Canonical::Primitive(a) => a.into_array(),
287            Canonical::Decimal(a) => a.into_array(),
288            Canonical::Struct(a) => a.into_array(),
289            Canonical::List(a) => a.into_array(),
290            Canonical::FixedSizeList(a) => a.into_array(),
291            Canonical::VarBinView(a) => a.into_array(),
292            Canonical::Extension(a) => a.into_array(),
293        }
294    }
295}
296
297/// Trait for types that can be converted from an owned type into an owned array variant.
298///
299/// # Canonicalization
300///
301/// This trait has a blanket implementation for all types implementing [ToCanonical].
302pub trait ToCanonical {
303    /// Canonicalize into a [`NullArray`] if the target is [`Null`](DType::Null) typed.
304    fn to_null(&self) -> NullArray;
305
306    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`](DType::Bool) typed.
307    fn to_bool(&self) -> BoolArray;
308
309    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`](DType::Primitive)
310    /// typed.
311    fn to_primitive(&self) -> PrimitiveArray;
312
313    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`](DType::Decimal)
314    /// typed.
315    fn to_decimal(&self) -> DecimalArray;
316
317    /// Canonicalize into a [`StructArray`] if the target is [`Struct`](DType::Struct) typed.
318    fn to_struct(&self) -> StructArray;
319
320    /// Canonicalize into a [`ListViewArray`] if the target is [`List`](DType::List) typed.
321    fn to_listview(&self) -> ListViewArray;
322
323    /// Canonicalize into a [`FixedSizeListArray`] if the target is [`List`](DType::FixedSizeList)
324    /// typed.
325    fn to_fixed_size_list(&self) -> FixedSizeListArray;
326
327    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`](DType::Utf8)
328    /// or [`Binary`](DType::Binary) typed.
329    fn to_varbinview(&self) -> VarBinViewArray;
330
331    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`](DType::Extension)
332    /// typed.
333    fn to_extension(&self) -> ExtensionArray;
334}
335
336// Blanket impl for all Array encodings.
337impl<A: Array + ?Sized> ToCanonical for A {
338    fn to_null(&self) -> NullArray {
339        self.to_canonical().into_null()
340    }
341
342    fn to_bool(&self) -> BoolArray {
343        self.to_canonical().into_bool()
344    }
345
346    fn to_primitive(&self) -> PrimitiveArray {
347        self.to_canonical().into_primitive()
348    }
349
350    fn to_decimal(&self) -> DecimalArray {
351        self.to_canonical().into_decimal()
352    }
353
354    fn to_struct(&self) -> StructArray {
355        self.to_canonical().into_struct()
356    }
357
358    fn to_listview(&self) -> ListViewArray {
359        self.to_canonical().into_listview()
360    }
361
362    fn to_fixed_size_list(&self) -> FixedSizeListArray {
363        self.to_canonical().into_fixed_size_list()
364    }
365
366    fn to_varbinview(&self) -> VarBinViewArray {
367        self.to_canonical().into_varbinview()
368    }
369
370    fn to_extension(&self) -> ExtensionArray {
371        self.to_canonical().into_extension()
372    }
373}
374
375impl From<Canonical> for ArrayRef {
376    fn from(value: Canonical) -> Self {
377        match value {
378            Canonical::Null(a) => a.into_array(),
379            Canonical::Bool(a) => a.into_array(),
380            Canonical::Primitive(a) => a.into_array(),
381            Canonical::Decimal(a) => a.into_array(),
382            Canonical::Struct(a) => a.into_array(),
383            Canonical::List(a) => a.into_array(),
384            Canonical::FixedSizeList(a) => a.into_array(),
385            Canonical::VarBinView(a) => a.into_array(),
386            Canonical::Extension(a) => a.into_array(),
387        }
388    }
389}
390
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use arrow_array::cast::AsArray;
    use arrow_array::types::{Int32Type, Int64Type, UInt64Type};
    use arrow_array::{
        Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
        PrimitiveArray as ArrowPrimitiveArray, StringArray, StringViewArray,
        StructArray as ArrowStructArray,
    };
    use arrow_buffer::{NullBufferBuilder, OffsetBuffer};
    use arrow_schema::{DataType, Field};
    use vortex_buffer::buffer;

    use crate::arrays::{ConstantArray, StructArray};
    use crate::arrow::{FromArrowArray, IntoArrowArray};
    use crate::{ArrayRef, IntoArray};

    /// A struct nested inside a struct, with a non-canonical leaf, converts to Arrow with the
    /// expected concrete Arrow array types at every level.
    #[test]
    fn test_canonicalize_nested_struct() {
        // The nested struct contains a ConstantArray representing the primitive array
        //   [100i64]
        // ConstantArray is not a canonical type, so converting `into_arrow()` should
        // map this to the nearest canonical type (PrimitiveArray).
        let inner_vortex = StructArray::from_fields(&[(
            "inner_a",
            ConstantArray::new(100i64, 1).into_array(),
        )])
        .unwrap()
        .into_array();

        // Create a struct array with multiple internal components.
        let outer_vortex = StructArray::from_fields(&[
            ("a", buffer![1u64].into_array()),
            ("b", inner_vortex),
        ])
        .unwrap();

        let arrow_array = outer_vortex.into_array().into_arrow_preferred().unwrap();
        let outer_arrow = arrow_array
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .unwrap();

        let column_a = outer_arrow
            .column(0)
            .as_any()
            .downcast_ref::<ArrowPrimitiveArray<UInt64Type>>();
        assert!(column_a.is_some());

        let inner_arrow = outer_arrow
            .column(1)
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .unwrap();

        let inner_a = inner_arrow
            .column(0)
            .as_any()
            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
        assert!(inner_a.is_some());

        let expected = ArrowPrimitiveArray::<Int64Type>::from_iter([100i64]);
        assert_eq!(inner_a.unwrap(), &expected);
    }

    /// An Arrow struct with nullable columns and struct-level nulls survives a trip through
    /// Vortex and back.
    #[test]
    fn roundtrip_struct() {
        // Struct-level validity: rows 0..4 valid, row 4 null, row 5 valid.
        let mut validity = NullBufferBuilder::new(6);
        validity.append_n_non_nulls(4);
        validity.append_null();
        validity.append_non_null();

        let name_column = Arc::new(StringViewArray::from_iter(vec![
            Some("Joseph"),
            None,
            Some("Angela"),
            Some("Mikhail"),
            None,
            None,
        ]));
        let age_column = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
            Some(25),
            Some(31),
            None,
            Some(57),
            None,
            None,
        ]));

        let schema_fields = vec![
            Arc::new(Field::new("name", DataType::Utf8View, true)),
            Arc::new(Field::new("age", DataType::Int32, true)),
        ];

        let arrow_struct = ArrowStructArray::new(
            schema_fields.into(),
            vec![name_column, age_column],
            validity.finish(),
        );

        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true);
        let roundtripped = vortex_struct.into_arrow_preferred().unwrap();

        assert_eq!(&arrow_struct, roundtripped.as_struct());
    }

    /// An Arrow list of strings survives a trip through Vortex and back when converted to its
    /// original Arrow data type.
    #[test]
    fn roundtrip_list() {
        let elements = Arc::new(StringArray::from_iter(vec![
            Some("Joseph"),
            Some("Angela"),
            Some("Mikhail"),
        ]));

        let element_field = Arc::new(Field::new_list_field(DataType::Utf8, true));
        let offsets = OffsetBuffer::from_lengths(vec![0, 2, 1]);
        let arrow_list = ArrowListArray::new(element_field, offsets, elements, None);
        let target_type = arrow_list.data_type();

        let vortex_list = ArrayRef::from_arrow(&arrow_list, true);
        let roundtripped = vortex_list.into_arrow(target_type).unwrap();

        let expected: ArrowArrayRef = Arc::new(arrow_list.clone());
        assert_eq!(expected.as_ref(), roundtripped.as_ref());
    }
}