vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use vortex_dtype::DType;
7use vortex_error::VortexResult;
8use vortex_error::vortex_panic;
9
10use crate::Array;
11use crate::ArrayRef;
12use crate::IntoArray;
13use crate::arrays::BoolArray;
14use crate::arrays::DecimalArray;
15use crate::arrays::ExtensionArray;
16use crate::arrays::FixedSizeListArray;
17use crate::arrays::ListViewArray;
18use crate::arrays::ListViewRebuildMode;
19use crate::arrays::NullArray;
20use crate::arrays::PrimitiveArray;
21use crate::arrays::StructArray;
22use crate::arrays::VarBinViewArray;
23use crate::builders::builder_with_capacity;
24
/// An enum capturing the default uncompressed encodings for each [Vortex type](DType).
///
/// Any array can be decoded into canonical form via the [`to_canonical`](Array::to_canonical)
/// trait method. This is the simplest encoding for a type, and will not be compressed but may
/// contain compressed child arrays.
///
/// Canonical form is useful for doing type-specific compute where you need to know that all
/// elements are laid out decompressed and contiguous in memory.
///
/// Each `Canonical` variant has a corresponding [`DType`] variant, with the notable exception of
/// [`Canonical::VarBinView`], which is the canonical encoding for both [`DType::Utf8`] and
/// [`DType::Binary`].
///
/// # Laziness
///
/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
/// `Struct` type, individual column child arrays may still be compressed. This allows
/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
/// never need to be decoded into canonical form at all depending on the compute.
///
/// # Arrow interoperability
///
/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
/// zero-copy, and the corresponding Arrow array types can also be built directly.
///
/// The full list of canonical types and their equivalent Arrow array types are:
///
/// * `NullArray`: [`arrow_array::NullArray`]
/// * `BoolArray`: [`arrow_array::BooleanArray`]
/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
/// * `ListViewArray`: [`arrow_array::ListViewArray`]
/// * `FixedSizeListArray`: [`arrow_array::FixedSizeListArray`]
/// * `StructArray`: [`arrow_array::StructArray`]
///
/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
/// create ambiguity: if you receive an Arrow array, compress it using Vortex, and then
/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
/// variants to hold the data.
///
/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
/// will correspond to an arrow-rs [`arrow_schema::DataType`].
///
/// # Views support
///
/// Binary and String views, also known as "German strings", are a better encoding format for
/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
/// fully supported by the DataFusion query engine. We use them as our canonical string encoding
/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
/// from memory.
///
/// # For Developers
///
/// If you add another variant to this enum, make sure to update [`Array::is_canonical`],
/// and the fuzzer in `fuzz/fuzz_targets/array_ops.rs`.
#[derive(Debug, Clone)]
pub enum Canonical {
    /// Canonical encoding for [`DType::Null`] typed arrays.
    Null(NullArray),
    /// Canonical encoding for [`DType::Bool`] typed arrays.
    Bool(BoolArray),
    /// Canonical encoding for [`DType::Primitive`] typed arrays.
    Primitive(PrimitiveArray),
    /// Canonical encoding for [`DType::Decimal`] typed arrays.
    Decimal(DecimalArray),
    /// Canonical encoding shared by [`DType::Utf8`] and [`DType::Binary`] typed arrays.
    VarBinView(VarBinViewArray),
    /// Canonical encoding for [`DType::List`] typed arrays. Note that the payload is a
    /// [`ListViewArray`] even though the variant is named `List`.
    List(ListViewArray),
    /// Canonical encoding for [`DType::FixedSizeList`] typed arrays.
    FixedSizeList(FixedSizeListArray),
    /// Canonical encoding for [`DType::Struct`] typed arrays. Child columns may still be
    /// compressed because canonical form is not recursive.
    Struct(StructArray),
    /// Canonical encoding for [`DType::Extension`] typed arrays.
    Extension(ExtensionArray),
}
97
98impl Canonical {
99    // TODO(connor): This can probably be specialized for each of the canonical arrays.
100    /// Create an empty canonical array of the given dtype.
101    pub fn empty(dtype: &DType) -> Canonical {
102        builder_with_capacity(dtype, 0).finish_into_canonical()
103    }
104}
105
106impl Canonical {
107    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
108    ///
109    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
110    /// they can accumulate wasted space after slicing and taking operations.
111    ///
112    /// This operation is very expensive and can result in things like allocations, full-scans
113    /// and copy operations.
114    pub fn compact(&self) -> VortexResult<Canonical> {
115        match self {
116            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
117            Canonical::List(array) => Ok(Canonical::List(
118                array.rebuild(ListViewRebuildMode::MakeZeroCopyToList),
119            )),
120            _ => Ok(self.clone()),
121        }
122    }
123}
124
125// Unwrap canonical type back down to specialized type.
126impl Canonical {
127    pub fn as_null(&self) -> &NullArray {
128        if let Canonical::Null(a) = self {
129            a
130        } else {
131            vortex_panic!("Cannot get NullArray from {:?}", &self)
132        }
133    }
134
135    pub fn into_null(self) -> NullArray {
136        if let Canonical::Null(a) = self {
137            a
138        } else {
139            vortex_panic!("Cannot unwrap NullArray from {:?}", &self)
140        }
141    }
142
143    pub fn as_bool(&self) -> &BoolArray {
144        if let Canonical::Bool(a) = self {
145            a
146        } else {
147            vortex_panic!("Cannot get BoolArray from {:?}", &self)
148        }
149    }
150
151    pub fn into_bool(self) -> BoolArray {
152        if let Canonical::Bool(a) = self {
153            a
154        } else {
155            vortex_panic!("Cannot unwrap BoolArray from {:?}", &self)
156        }
157    }
158
159    pub fn as_primitive(&self) -> &PrimitiveArray {
160        if let Canonical::Primitive(a) = self {
161            a
162        } else {
163            vortex_panic!("Cannot get PrimitiveArray from {:?}", &self)
164        }
165    }
166
167    pub fn into_primitive(self) -> PrimitiveArray {
168        if let Canonical::Primitive(a) = self {
169            a
170        } else {
171            vortex_panic!("Cannot unwrap PrimitiveArray from {:?}", &self)
172        }
173    }
174
175    pub fn as_decimal(&self) -> &DecimalArray {
176        if let Canonical::Decimal(a) = self {
177            a
178        } else {
179            vortex_panic!("Cannot get DecimalArray from {:?}", &self)
180        }
181    }
182
183    pub fn into_decimal(self) -> DecimalArray {
184        if let Canonical::Decimal(a) = self {
185            a
186        } else {
187            vortex_panic!("Cannot unwrap DecimalArray from {:?}", &self)
188        }
189    }
190
191    pub fn as_varbinview(&self) -> &VarBinViewArray {
192        if let Canonical::VarBinView(a) = self {
193            a
194        } else {
195            vortex_panic!("Cannot get VarBinViewArray from {:?}", &self)
196        }
197    }
198
199    pub fn into_varbinview(self) -> VarBinViewArray {
200        if let Canonical::VarBinView(a) = self {
201            a
202        } else {
203            vortex_panic!("Cannot unwrap VarBinViewArray from {:?}", &self)
204        }
205    }
206
207    pub fn as_listview(&self) -> &ListViewArray {
208        if let Canonical::List(a) = self {
209            a
210        } else {
211            vortex_panic!("Cannot get ListArray from {:?}", &self)
212        }
213    }
214
215    pub fn into_listview(self) -> ListViewArray {
216        if let Canonical::List(a) = self {
217            a
218        } else {
219            vortex_panic!("Cannot unwrap ListArray from {:?}", &self)
220        }
221    }
222
223    pub fn as_fixed_size_list(&self) -> &FixedSizeListArray {
224        if let Canonical::FixedSizeList(a) = self {
225            a
226        } else {
227            vortex_panic!("Cannot get FixedSizeListArray from {:?}", &self)
228        }
229    }
230
231    pub fn into_fixed_size_list(self) -> FixedSizeListArray {
232        if let Canonical::FixedSizeList(a) = self {
233            a
234        } else {
235            vortex_panic!("Cannot unwrap FixedSizeListArray from {:?}", &self)
236        }
237    }
238
239    pub fn as_struct(&self) -> &StructArray {
240        if let Canonical::Struct(a) = self {
241            a
242        } else {
243            vortex_panic!("Cannot get StructArray from {:?}", &self)
244        }
245    }
246
247    pub fn into_struct(self) -> StructArray {
248        if let Canonical::Struct(a) = self {
249            a
250        } else {
251            vortex_panic!("Cannot unwrap StructArray from {:?}", &self)
252        }
253    }
254
255    pub fn as_extension(&self) -> &ExtensionArray {
256        if let Canonical::Extension(a) = self {
257            a
258        } else {
259            vortex_panic!("Cannot get ExtensionArray from {:?}", &self)
260        }
261    }
262
263    pub fn into_extension(self) -> ExtensionArray {
264        if let Canonical::Extension(a) = self {
265            a
266        } else {
267            vortex_panic!("Cannot unwrap ExtensionArray from {:?}", &self)
268        }
269    }
270}
271
272impl AsRef<dyn Array> for Canonical {
273    fn as_ref(&self) -> &(dyn Array + 'static) {
274        match &self {
275            Canonical::Null(a) => a.as_ref(),
276            Canonical::Bool(a) => a.as_ref(),
277            Canonical::Primitive(a) => a.as_ref(),
278            Canonical::Decimal(a) => a.as_ref(),
279            Canonical::Struct(a) => a.as_ref(),
280            Canonical::List(a) => a.as_ref(),
281            Canonical::FixedSizeList(a) => a.as_ref(),
282            Canonical::VarBinView(a) => a.as_ref(),
283            Canonical::Extension(a) => a.as_ref(),
284        }
285    }
286}
287
288impl IntoArray for Canonical {
289    fn into_array(self) -> ArrayRef {
290        match self {
291            Canonical::Null(a) => a.into_array(),
292            Canonical::Bool(a) => a.into_array(),
293            Canonical::Primitive(a) => a.into_array(),
294            Canonical::Decimal(a) => a.into_array(),
295            Canonical::Struct(a) => a.into_array(),
296            Canonical::List(a) => a.into_array(),
297            Canonical::FixedSizeList(a) => a.into_array(),
298            Canonical::VarBinView(a) => a.into_array(),
299            Canonical::Extension(a) => a.into_array(),
300        }
301    }
302}
303
/// Convenience methods for canonicalizing an array directly into one specific canonical array
/// type.
///
/// # Canonicalization
///
/// This trait has a blanket implementation for all types implementing [`Array`]. That
/// implementation canonicalizes the array and then unwraps the requested [`Canonical`] variant,
/// so it panics if the canonical form of the array is a different variant.
pub trait ToCanonical {
    /// Canonicalize into a [`NullArray`]. Intended for [`Null`](DType::Null) typed arrays.
    fn to_null(&self) -> NullArray;

    /// Canonicalize into a [`BoolArray`]. Intended for [`Bool`](DType::Bool) typed arrays.
    fn to_bool(&self) -> BoolArray;

    /// Canonicalize into a [`PrimitiveArray`]. Intended for [`Primitive`](DType::Primitive)
    /// typed arrays.
    fn to_primitive(&self) -> PrimitiveArray;

    /// Canonicalize into a [`DecimalArray`]. Intended for [`Decimal`](DType::Decimal)
    /// typed arrays.
    fn to_decimal(&self) -> DecimalArray;

    /// Canonicalize into a [`StructArray`]. Intended for [`Struct`](DType::Struct) typed
    /// arrays.
    fn to_struct(&self) -> StructArray;

    /// Canonicalize into a [`ListViewArray`]. Intended for [`List`](DType::List) typed arrays.
    fn to_listview(&self) -> ListViewArray;

    /// Canonicalize into a [`FixedSizeListArray`]. Intended for
    /// [`FixedSizeList`](DType::FixedSizeList) typed arrays.
    fn to_fixed_size_list(&self) -> FixedSizeListArray;

    /// Canonicalize into a [`VarBinViewArray`]. Intended for [`Utf8`](DType::Utf8)
    /// or [`Binary`](DType::Binary) typed arrays.
    fn to_varbinview(&self) -> VarBinViewArray;

    /// Canonicalize into an [`ExtensionArray`]. Intended for [`Extension`](DType::Extension)
    /// typed arrays.
    fn to_extension(&self) -> ExtensionArray;
}
342
343// Blanket impl for all Array encodings.
344impl<A: Array + ?Sized> ToCanonical for A {
345    fn to_null(&self) -> NullArray {
346        self.to_canonical().into_null()
347    }
348
349    fn to_bool(&self) -> BoolArray {
350        self.to_canonical().into_bool()
351    }
352
353    fn to_primitive(&self) -> PrimitiveArray {
354        self.to_canonical().into_primitive()
355    }
356
357    fn to_decimal(&self) -> DecimalArray {
358        self.to_canonical().into_decimal()
359    }
360
361    fn to_struct(&self) -> StructArray {
362        self.to_canonical().into_struct()
363    }
364
365    fn to_listview(&self) -> ListViewArray {
366        self.to_canonical().into_listview()
367    }
368
369    fn to_fixed_size_list(&self) -> FixedSizeListArray {
370        self.to_canonical().into_fixed_size_list()
371    }
372
373    fn to_varbinview(&self) -> VarBinViewArray {
374        self.to_canonical().into_varbinview()
375    }
376
377    fn to_extension(&self) -> ExtensionArray {
378        self.to_canonical().into_extension()
379    }
380}
381
382impl From<Canonical> for ArrayRef {
383    fn from(value: Canonical) -> Self {
384        match value {
385            Canonical::Null(a) => a.into_array(),
386            Canonical::Bool(a) => a.into_array(),
387            Canonical::Primitive(a) => a.into_array(),
388            Canonical::Decimal(a) => a.into_array(),
389            Canonical::Struct(a) => a.into_array(),
390            Canonical::List(a) => a.into_array(),
391            Canonical::FixedSizeList(a) => a.into_array(),
392            Canonical::VarBinView(a) => a.into_array(),
393            Canonical::Extension(a) => a.into_array(),
394        }
395    }
396}
397
398#[cfg(test)]
399mod test {
400    use std::sync::Arc;
401
402    use arrow_array::Array as ArrowArray;
403    use arrow_array::ArrayRef as ArrowArrayRef;
404    use arrow_array::ListArray as ArrowListArray;
405    use arrow_array::PrimitiveArray as ArrowPrimitiveArray;
406    use arrow_array::StringArray;
407    use arrow_array::StringViewArray;
408    use arrow_array::StructArray as ArrowStructArray;
409    use arrow_array::cast::AsArray;
410    use arrow_array::types::Int32Type;
411    use arrow_array::types::Int64Type;
412    use arrow_array::types::UInt64Type;
413    use arrow_buffer::NullBufferBuilder;
414    use arrow_buffer::OffsetBuffer;
415    use arrow_schema::DataType;
416    use arrow_schema::Field;
417    use vortex_buffer::buffer;
418
419    use crate::ArrayRef;
420    use crate::IntoArray;
421    use crate::arrays::ConstantArray;
422    use crate::arrays::StructArray;
423    use crate::arrow::FromArrowArray;
424    use crate::arrow::IntoArrowArray;
425
426    #[test]
427    fn test_canonicalize_nested_struct() {
428        // Create a struct array with multiple internal components.
429        let nested_struct_array = StructArray::from_fields(&[
430            ("a", buffer![1u64].into_array()),
431            (
432                "b",
433                StructArray::from_fields(&[(
434                    "inner_a",
435                    // The nested struct contains a ConstantArray representing the primitive array
436                    //   [100i64]
437                    // ConstantArray is not a canonical type, so converting `into_arrow()` should
438                    // map this to the nearest canonical type (PrimitiveArray).
439                    ConstantArray::new(100i64, 1).into_array(),
440                )])
441                .unwrap()
442                .into_array(),
443            ),
444        ])
445        .unwrap();
446
447        let arrow_struct = nested_struct_array
448            .into_array()
449            .into_arrow_preferred()
450            .unwrap()
451            .as_any()
452            .downcast_ref::<ArrowStructArray>()
453            .cloned()
454            .unwrap();
455
456        assert!(
457            arrow_struct
458                .column(0)
459                .as_any()
460                .downcast_ref::<ArrowPrimitiveArray<UInt64Type>>()
461                .is_some()
462        );
463
464        let inner_struct = arrow_struct
465            .column(1)
466            .clone()
467            .as_any()
468            .downcast_ref::<ArrowStructArray>()
469            .cloned()
470            .unwrap();
471
472        let inner_a = inner_struct
473            .column(0)
474            .as_any()
475            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
476        assert!(inner_a.is_some());
477
478        assert_eq!(
479            inner_a.cloned().unwrap(),
480            ArrowPrimitiveArray::from_iter([100i64])
481        );
482    }
483
484    #[test]
485    fn roundtrip_struct() {
486        let mut nulls = NullBufferBuilder::new(6);
487        nulls.append_n_non_nulls(4);
488        nulls.append_null();
489        nulls.append_non_null();
490        let names = Arc::new(StringViewArray::from_iter(vec![
491            Some("Joseph"),
492            None,
493            Some("Angela"),
494            Some("Mikhail"),
495            None,
496            None,
497        ]));
498        let ages = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
499            Some(25),
500            Some(31),
501            None,
502            Some(57),
503            None,
504            None,
505        ]));
506
507        let arrow_struct = ArrowStructArray::new(
508            vec![
509                Arc::new(Field::new("name", DataType::Utf8View, true)),
510                Arc::new(Field::new("age", DataType::Int32, true)),
511            ]
512            .into(),
513            vec![names, ages],
514            nulls.finish(),
515        );
516
517        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true);
518
519        assert_eq!(
520            &arrow_struct,
521            vortex_struct.into_arrow_preferred().unwrap().as_struct()
522        );
523    }
524
525    #[test]
526    fn roundtrip_list() {
527        let names = Arc::new(StringArray::from_iter(vec![
528            Some("Joseph"),
529            Some("Angela"),
530            Some("Mikhail"),
531        ]));
532
533        let arrow_list = ArrowListArray::new(
534            Arc::new(Field::new_list_field(DataType::Utf8, true)),
535            OffsetBuffer::from_lengths(vec![0, 2, 1]),
536            names,
537            None,
538        );
539        let list_data_type = arrow_list.data_type();
540
541        let vortex_list = ArrayRef::from_arrow(&arrow_list, true);
542
543        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();
544
545        assert_eq!(
546            (Arc::new(arrow_list.clone()) as ArrowArrayRef).as_ref(),
547            rt_arrow_list.as_ref()
548        );
549    }
550}