Skip to main content

vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use std::sync::Arc;
7
8use vortex_buffer::BitBuffer;
9use vortex_buffer::Buffer;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_ensure;
13use vortex_error::vortex_panic;
14
15use crate::ArrayRef;
16use crate::DynArray;
17use crate::Executable;
18use crate::ExecutionCtx;
19use crate::IntoArray;
20use crate::arrays::Bool;
21use crate::arrays::BoolArray;
22use crate::arrays::Decimal;
23use crate::arrays::DecimalArray;
24use crate::arrays::Extension;
25use crate::arrays::ExtensionArray;
26use crate::arrays::FixedSizeList;
27use crate::arrays::FixedSizeListArray;
28use crate::arrays::ListView;
29use crate::arrays::ListViewArray;
30use crate::arrays::Null;
31use crate::arrays::NullArray;
32use crate::arrays::Primitive;
33use crate::arrays::PrimitiveArray;
34use crate::arrays::Struct;
35use crate::arrays::StructArray;
36use crate::arrays::VarBinView;
37use crate::arrays::VarBinViewArray;
38use crate::arrays::bool::BoolArrayParts;
39use crate::arrays::decimal::DecimalArrayParts;
40use crate::arrays::listview::ListViewArrayParts;
41use crate::arrays::listview::ListViewRebuildMode;
42use crate::arrays::primitive::PrimitiveArrayParts;
43use crate::arrays::struct_::StructArrayParts;
44use crate::arrays::varbinview::VarBinViewArrayParts;
45use crate::dtype::DType;
46use crate::dtype::NativePType;
47use crate::dtype::Nullability;
48use crate::dtype::PType;
49use crate::match_each_decimal_value_type;
50use crate::match_each_native_ptype;
51use crate::matcher::Matcher;
52use crate::validity::Validity;
53
54/// An enum capturing the default uncompressed encodings for each [Vortex type](DType).
55///
56/// Any array can be decoded into canonical form via the [`to_canonical`](DynArray::to_canonical)
57/// trait method. This is the simplest encoding for a type, and will not be compressed but may
58/// contain compressed child arrays.
59///
60/// Canonical form is useful for doing type-specific compute where you need to know that all
61/// elements are laid out decompressed and contiguous in memory.
62///
63/// Each `Canonical` variant has a corresponding [`DType`] variant, with the notable exception of
64/// [`Canonical::VarBinView`], which is the canonical encoding for both [`DType::Utf8`] and
65/// [`DType::Binary`].
66///
67/// # Laziness
68///
69/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
70/// `Struct` type, individual column child arrays may still be compressed. This allows
71/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
72/// never need to be decoded into canonical form at all depending on the compute.
73///
74/// # Arrow interoperability
75///
76/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
77/// zero-copy, and the corresponding Arrow array types can also be built directly.
78///
79/// The full list of canonical types and their equivalent Arrow array types are:
80///
81/// * `NullArray`: [`arrow_array::NullArray`]
82/// * `BoolArray`: [`arrow_array::BooleanArray`]
83/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
84/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
85/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
86/// * `ListViewArray`: [`arrow_array::ListViewArray`]
87/// * `FixedSizeListArray`: [`arrow_array::FixedSizeListArray`]
88/// * `StructArray`: [`arrow_array::StructArray`]
89///
90/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
91/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
92/// create ambiguity.
93/// Thus, if you receive an Arrow array, compress it using Vortex, and then
94/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
95/// variants to hold the data.
96///
97/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
98/// will correspond to an arrow-rs [`arrow_schema::DataType`].
99///
100/// # Views support
101///
102/// Binary and String views, also known as "German strings" are a better encoding format for
103/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
104/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
105/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
106/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
107/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
108/// from memory.
109///
110/// # For Developers
111///
112/// If you add another variant to this enum, make sure to update `dyn Array::is_canonical`,
113/// and the fuzzer in `fuzz/fuzz_targets/array_ops.rs`.
#[derive(Debug, Clone)]
pub enum Canonical {
    /// Canonical form for [`DType::Null`] arrays.
    Null(NullArray),
    /// Canonical form for [`DType::Bool`] arrays.
    Bool(BoolArray),
    /// Canonical form for [`DType::Primitive`] arrays.
    Primitive(PrimitiveArray),
    /// Canonical form for [`DType::Decimal`] arrays.
    Decimal(DecimalArray),
    /// Canonical form for both [`DType::Utf8`] and [`DType::Binary`] arrays.
    VarBinView(VarBinViewArray),
    /// Canonical form for [`DType::List`] arrays. Note that the payload is a [`ListViewArray`].
    List(ListViewArray),
    /// Canonical form for [`DType::FixedSizeList`] arrays.
    FixedSizeList(FixedSizeListArray),
    /// Canonical form for [`DType::Struct`] arrays.
    Struct(StructArray),
    /// Canonical form for [`DType::Extension`] arrays.
    Extension(ExtensionArray),
}
126
/// Dispatch over every [`Canonical`] variant, binding the wrapped array to `$binding` and
/// evaluating the same `$body` expression in each arm.
macro_rules! match_each_canonical {
    ($canonical:expr, | $binding:ident | $body:expr) => {{
        match $canonical {
            Canonical::Null($binding) => $body,
            Canonical::Bool($binding) => $body,
            Canonical::Primitive($binding) => $body,
            Canonical::Decimal($binding) => $body,
            Canonical::VarBinView($binding) => $body,
            Canonical::List($binding) => $body,
            Canonical::FixedSizeList($binding) => $body,
            Canonical::Struct($binding) => $body,
            Canonical::Extension($binding) => $body,
        }
    }};
}
143
144impl Canonical {
145    /// Create an empty canonical array of the given dtype.
146    pub fn empty(dtype: &DType) -> Canonical {
147        match dtype {
148            DType::Null => Canonical::Null(NullArray::new(0)),
149            DType::Bool(n) => Canonical::Bool(unsafe {
150                BoolArray::new_unchecked(BitBuffer::empty(), Validity::from(n))
151            }),
152            DType::Primitive(ptype, n) => {
153                match_each_native_ptype!(ptype, |P| {
154                    Canonical::Primitive(unsafe {
155                        PrimitiveArray::new_unchecked(Buffer::<P>::empty(), Validity::from(n))
156                    })
157                })
158            }
159            DType::Decimal(decimal_type, n) => {
160                match_each_decimal_value_type!(
161                    DecimalType::smallest_decimal_value_type(decimal_type),
162                    |D| {
163                        Canonical::Decimal(unsafe {
164                            DecimalArray::new_unchecked::<D>(
165                                Buffer::empty(),
166                                *decimal_type,
167                                Validity::from(n),
168                            )
169                        })
170                    }
171                )
172            }
173            DType::Utf8(n) => Canonical::VarBinView(unsafe {
174                VarBinViewArray::new_unchecked(
175                    Buffer::empty(),
176                    Arc::new([]),
177                    dtype.clone(),
178                    Validity::from(n),
179                )
180            }),
181            DType::Binary(n) => Canonical::VarBinView(unsafe {
182                VarBinViewArray::new_unchecked(
183                    Buffer::empty(),
184                    Arc::new([]),
185                    dtype.clone(),
186                    Validity::from(n),
187                )
188            }),
189            DType::Struct(struct_dtype, n) => Canonical::Struct(unsafe {
190                StructArray::new_unchecked(
191                    struct_dtype
192                        .fields()
193                        .map(|f| Canonical::empty(&f).into_array())
194                        .collect::<Arc<[_]>>(),
195                    struct_dtype.clone(),
196                    0,
197                    Validity::from(n),
198                )
199            }),
200            DType::List(dtype, n) => Canonical::List(unsafe {
201                ListViewArray::new_unchecked(
202                    Canonical::empty(dtype).into_array(),
203                    Canonical::empty(&DType::Primitive(PType::U8, Nullability::NonNullable))
204                        .into_array(),
205                    Canonical::empty(&DType::Primitive(PType::U8, Nullability::NonNullable))
206                        .into_array(),
207                    Validity::from(n),
208                )
209                // An empty list view is trivially copyable to a list.
210                .with_zero_copy_to_list(true)
211            }),
212            DType::FixedSizeList(elem_dtype, list_size, null) => Canonical::FixedSizeList(unsafe {
213                FixedSizeListArray::new_unchecked(
214                    Canonical::empty(elem_dtype).into_array(),
215                    *list_size,
216                    Validity::from(null),
217                    0,
218                )
219            }),
220            DType::Extension(ext_dtype) => Canonical::Extension(ExtensionArray::new(
221                ext_dtype.clone(),
222                Canonical::empty(ext_dtype.storage_dtype()).into_array(),
223            )),
224        }
225    }
226
227    pub fn len(&self) -> usize {
228        match_each_canonical!(self, |arr| arr.len())
229    }
230
231    pub fn dtype(&self) -> &DType {
232        match_each_canonical!(self, |arr| arr.dtype())
233    }
234
235    pub fn is_empty(&self) -> bool {
236        match_each_canonical!(self, |arr| arr.is_empty())
237    }
238}
239
240impl Canonical {
241    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
242    ///
243    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
244    /// they can accumulate wasted space after slicing and taking operations.
245    ///
246    /// This operation is very expensive and can result in things like allocations, full-scans
247    /// and copy operations.
248    pub fn compact(&self) -> VortexResult<Canonical> {
249        match self {
250            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
251            Canonical::List(array) => Ok(Canonical::List(
252                array.rebuild(ListViewRebuildMode::TrimElements)?,
253            )),
254            _ => Ok(self.clone()),
255        }
256    }
257}
258
259// Unwrap canonical type back down to specialized type.
260impl Canonical {
261    pub fn as_null(&self) -> &NullArray {
262        if let Canonical::Null(a) = self {
263            a
264        } else {
265            vortex_panic!("Cannot get NullArray from {:?}", &self)
266        }
267    }
268
269    pub fn into_null(self) -> NullArray {
270        if let Canonical::Null(a) = self {
271            a
272        } else {
273            vortex_panic!("Cannot unwrap NullArray from {:?}", &self)
274        }
275    }
276
277    pub fn as_bool(&self) -> &BoolArray {
278        if let Canonical::Bool(a) = self {
279            a
280        } else {
281            vortex_panic!("Cannot get BoolArray from {:?}", &self)
282        }
283    }
284
285    pub fn into_bool(self) -> BoolArray {
286        if let Canonical::Bool(a) = self {
287            a
288        } else {
289            vortex_panic!("Cannot unwrap BoolArray from {:?}", &self)
290        }
291    }
292
293    pub fn as_primitive(&self) -> &PrimitiveArray {
294        if let Canonical::Primitive(a) = self {
295            a
296        } else {
297            vortex_panic!("Cannot get PrimitiveArray from {:?}", &self)
298        }
299    }
300
301    pub fn into_primitive(self) -> PrimitiveArray {
302        if let Canonical::Primitive(a) = self {
303            a
304        } else {
305            vortex_panic!("Cannot unwrap PrimitiveArray from {:?}", &self)
306        }
307    }
308
309    pub fn as_decimal(&self) -> &DecimalArray {
310        if let Canonical::Decimal(a) = self {
311            a
312        } else {
313            vortex_panic!("Cannot get DecimalArray from {:?}", &self)
314        }
315    }
316
317    pub fn into_decimal(self) -> DecimalArray {
318        if let Canonical::Decimal(a) = self {
319            a
320        } else {
321            vortex_panic!("Cannot unwrap DecimalArray from {:?}", &self)
322        }
323    }
324
325    pub fn as_varbinview(&self) -> &VarBinViewArray {
326        if let Canonical::VarBinView(a) = self {
327            a
328        } else {
329            vortex_panic!("Cannot get VarBinViewArray from {:?}", &self)
330        }
331    }
332
333    pub fn into_varbinview(self) -> VarBinViewArray {
334        if let Canonical::VarBinView(a) = self {
335            a
336        } else {
337            vortex_panic!("Cannot unwrap VarBinViewArray from {:?}", &self)
338        }
339    }
340
341    pub fn as_listview(&self) -> &ListViewArray {
342        if let Canonical::List(a) = self {
343            a
344        } else {
345            vortex_panic!("Cannot get ListArray from {:?}", &self)
346        }
347    }
348
349    pub fn into_listview(self) -> ListViewArray {
350        if let Canonical::List(a) = self {
351            a
352        } else {
353            vortex_panic!("Cannot unwrap ListArray from {:?}", &self)
354        }
355    }
356
357    pub fn as_fixed_size_list(&self) -> &FixedSizeListArray {
358        if let Canonical::FixedSizeList(a) = self {
359            a
360        } else {
361            vortex_panic!("Cannot get FixedSizeListArray from {:?}", &self)
362        }
363    }
364
365    pub fn into_fixed_size_list(self) -> FixedSizeListArray {
366        if let Canonical::FixedSizeList(a) = self {
367            a
368        } else {
369            vortex_panic!("Cannot unwrap FixedSizeListArray from {:?}", &self)
370        }
371    }
372
373    pub fn as_struct(&self) -> &StructArray {
374        if let Canonical::Struct(a) = self {
375            a
376        } else {
377            vortex_panic!("Cannot get StructArray from {:?}", &self)
378        }
379    }
380
381    pub fn into_struct(self) -> StructArray {
382        if let Canonical::Struct(a) = self {
383            a
384        } else {
385            vortex_panic!("Cannot unwrap StructArray from {:?}", &self)
386        }
387    }
388
389    pub fn as_extension(&self) -> &ExtensionArray {
390        if let Canonical::Extension(a) = self {
391            a
392        } else {
393            vortex_panic!("Cannot get ExtensionArray from {:?}", &self)
394        }
395    }
396
397    pub fn into_extension(self) -> ExtensionArray {
398        if let Canonical::Extension(a) = self {
399            a
400        } else {
401            vortex_panic!("Cannot unwrap ExtensionArray from {:?}", &self)
402        }
403    }
404}
405
406impl AsRef<dyn DynArray> for Canonical {
407    fn as_ref(&self) -> &(dyn DynArray + 'static) {
408        match_each_canonical!(self, |arr| arr.as_ref())
409    }
410}
411
412impl IntoArray for Canonical {
413    fn into_array(self) -> ArrayRef {
414        match_each_canonical!(self, |arr| arr.into_array())
415    }
416}
417
/// Convenience methods that decode an array into one specific canonical array type.
///
/// Every method borrows `self` and returns an owned canonical array.
///
/// # Canonicalization
///
/// This trait has a blanket implementation for all types implementing [`DynArray`].
///
/// # Panics
///
/// The blanket implementation calls `to_canonical` and then unwraps the requested variant, so
/// each method panics if canonicalization fails or if the array canonicalizes to a different
/// variant than the one requested.
pub trait ToCanonical {
    /// Canonicalize into a [`NullArray`] if the target is [`Null`](DType::Null) typed.
    fn to_null(&self) -> NullArray;

    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`](DType::Bool) typed.
    fn to_bool(&self) -> BoolArray;

    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`](DType::Primitive)
    /// typed.
    fn to_primitive(&self) -> PrimitiveArray;

    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`](DType::Decimal)
    /// typed.
    fn to_decimal(&self) -> DecimalArray;

    /// Canonicalize into a [`StructArray`] if the target is [`Struct`](DType::Struct) typed.
    fn to_struct(&self) -> StructArray;

    /// Canonicalize into a [`ListViewArray`] if the target is [`List`](DType::List) typed.
    fn to_listview(&self) -> ListViewArray;

    /// Canonicalize into a [`FixedSizeListArray`] if the target is
    /// [`FixedSizeList`](DType::FixedSizeList) typed.
    fn to_fixed_size_list(&self) -> FixedSizeListArray;

    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`](DType::Utf8)
    /// or [`Binary`](DType::Binary) typed.
    fn to_varbinview(&self) -> VarBinViewArray;

    /// Canonicalize into an [`ExtensionArray`] if the target is [`Extension`](DType::Extension)
    /// typed.
    fn to_extension(&self) -> ExtensionArray;
}
456
457// Blanket impl for all Array encodings.
458impl<A: DynArray + ?Sized> ToCanonical for A {
459    fn to_null(&self) -> NullArray {
460        self.to_canonical()
461            .vortex_expect("to_canonical failed")
462            .into_null()
463    }
464
465    fn to_bool(&self) -> BoolArray {
466        self.to_canonical()
467            .vortex_expect("to_canonical failed")
468            .into_bool()
469    }
470
471    fn to_primitive(&self) -> PrimitiveArray {
472        self.to_canonical()
473            .vortex_expect("to_canonical failed")
474            .into_primitive()
475    }
476
477    fn to_decimal(&self) -> DecimalArray {
478        self.to_canonical()
479            .vortex_expect("to_canonical failed")
480            .into_decimal()
481    }
482
483    fn to_struct(&self) -> StructArray {
484        self.to_canonical()
485            .vortex_expect("to_canonical failed")
486            .into_struct()
487    }
488
489    fn to_listview(&self) -> ListViewArray {
490        self.to_canonical()
491            .vortex_expect("to_canonical failed")
492            .into_listview()
493    }
494
495    fn to_fixed_size_list(&self) -> FixedSizeListArray {
496        self.to_canonical()
497            .vortex_expect("to_canonical failed")
498            .into_fixed_size_list()
499    }
500
501    fn to_varbinview(&self) -> VarBinViewArray {
502        self.to_canonical()
503            .vortex_expect("to_canonical failed")
504            .into_varbinview()
505    }
506
507    fn to_extension(&self) -> ExtensionArray {
508        self.to_canonical()
509            .vortex_expect("to_canonical failed")
510            .into_extension()
511    }
512}
513
514impl From<Canonical> for ArrayRef {
515    fn from(value: Canonical) -> Self {
516        match_each_canonical!(value, |arr| arr.into_array())
517    }
518}
519
520/// Execute into [`Canonical`] by running `execute_until` with the [`AnyCanonical`] matcher.
521///
522/// Unlike executing into [`crate::Columnar`], this will fully expand constant arrays into their
523/// canonical form. Callers should prefer to execute into `Columnar` if they are able to optimize
524/// their use for constant arrays.
525impl Executable for Canonical {
526    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
527        let result = array.execute_until::<AnyCanonical>(ctx)?;
528        Ok(result
529            .as_opt::<AnyCanonical>()
530            .map(Canonical::from)
531            .vortex_expect("execute_until::<AnyCanonical> must return a canonical array"))
532    }
533}
534
/// Recursively execute the array until it reaches canonical form along with its validity.
///
/// The resulting [`Canonical`] is exposed as the public tuple field `.0`.
///
/// Callers should prefer to execute into `Columnar` instead of this specific target.
/// This target is useful when preparing arrays for writing.
pub struct CanonicalValidity(pub Canonical);
540
541impl Executable for CanonicalValidity {
542    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
543        match array.execute::<Canonical>(ctx)? {
544            n @ Canonical::Null(_) => Ok(CanonicalValidity(n)),
545            Canonical::Bool(b) => {
546                let BoolArrayParts {
547                    bits,
548                    offset,
549                    len,
550                    validity,
551                } = b.into_parts();
552                Ok(CanonicalValidity(Canonical::Bool(
553                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
554                )))
555            }
556            Canonical::Primitive(p) => {
557                let PrimitiveArrayParts {
558                    ptype,
559                    buffer,
560                    validity,
561                } = p.into_parts();
562                Ok(CanonicalValidity(Canonical::Primitive(unsafe {
563                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
564                })))
565            }
566            Canonical::Decimal(d) => {
567                let DecimalArrayParts {
568                    decimal_dtype,
569                    values,
570                    values_type,
571                    validity,
572                } = d.into_parts();
573                Ok(CanonicalValidity(Canonical::Decimal(unsafe {
574                    DecimalArray::new_unchecked_handle(
575                        values,
576                        values_type,
577                        decimal_dtype,
578                        validity.execute(ctx)?,
579                    )
580                })))
581            }
582            Canonical::VarBinView(vbv) => {
583                let VarBinViewArrayParts {
584                    dtype,
585                    buffers,
586                    views,
587                    validity,
588                } = vbv.into_parts();
589                Ok(CanonicalValidity(Canonical::VarBinView(unsafe {
590                    VarBinViewArray::new_handle_unchecked(
591                        views,
592                        buffers,
593                        dtype,
594                        validity.execute(ctx)?,
595                    )
596                })))
597            }
598            Canonical::List(l) => {
599                let zctl = l.is_zero_copy_to_list();
600                let ListViewArrayParts {
601                    elements,
602                    offsets,
603                    sizes,
604                    validity,
605                    ..
606                } = l.into_parts();
607                Ok(CanonicalValidity(Canonical::List(unsafe {
608                    ListViewArray::new_unchecked(elements, offsets, sizes, validity.execute(ctx)?)
609                        .with_zero_copy_to_list(zctl)
610                })))
611            }
612            Canonical::FixedSizeList(fsl) => {
613                let list_size = fsl.list_size();
614                let len = fsl.len();
615                let (elements, validity, _) = fsl.into_parts();
616                Ok(CanonicalValidity(Canonical::FixedSizeList(
617                    FixedSizeListArray::new(elements, list_size, validity.execute(ctx)?, len),
618                )))
619            }
620            Canonical::Struct(st) => {
621                let len = st.len();
622                let StructArrayParts {
623                    struct_fields,
624                    fields,
625                    validity,
626                } = st.into_parts();
627                Ok(CanonicalValidity(Canonical::Struct(unsafe {
628                    StructArray::new_unchecked(fields, struct_fields, len, validity.execute(ctx)?)
629                })))
630            }
631            Canonical::Extension(ext) => Ok(CanonicalValidity(Canonical::Extension(
632                ExtensionArray::new(
633                    ext.ext_dtype().clone(),
634                    ext.storage_array()
635                        .clone()
636                        .execute::<CanonicalValidity>(ctx)?
637                        .0
638                        .into_array(),
639                ),
640            ))),
641        }
642    }
643}
644
/// Recursively execute the array until all of its children are canonical.
///
/// The resulting [`Canonical`] is exposed as the public tuple field `.0`.
///
/// This target is useful to guarantee that all operators are fully executed, but callers
/// should prefer an execution target that's suitable for their use case instead of this one.
pub struct RecursiveCanonical(pub Canonical);
650
651impl Executable for RecursiveCanonical {
652    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
653        match array.execute::<Canonical>(ctx)? {
654            n @ Canonical::Null(_) => Ok(RecursiveCanonical(n)),
655            Canonical::Bool(b) => {
656                let BoolArrayParts {
657                    bits,
658                    offset,
659                    len,
660                    validity,
661                } = b.into_parts();
662                Ok(RecursiveCanonical(Canonical::Bool(
663                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
664                )))
665            }
666            Canonical::Primitive(p) => {
667                let PrimitiveArrayParts {
668                    ptype,
669                    buffer,
670                    validity,
671                } = p.into_parts();
672                Ok(RecursiveCanonical(Canonical::Primitive(unsafe {
673                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
674                })))
675            }
676            Canonical::Decimal(d) => {
677                let DecimalArrayParts {
678                    decimal_dtype,
679                    values,
680                    values_type,
681                    validity,
682                } = d.into_parts();
683                Ok(RecursiveCanonical(Canonical::Decimal(unsafe {
684                    DecimalArray::new_unchecked_handle(
685                        values,
686                        values_type,
687                        decimal_dtype,
688                        validity.execute(ctx)?,
689                    )
690                })))
691            }
692            Canonical::VarBinView(vbv) => {
693                let VarBinViewArrayParts {
694                    dtype,
695                    buffers,
696                    views,
697                    validity,
698                } = vbv.into_parts();
699                Ok(RecursiveCanonical(Canonical::VarBinView(unsafe {
700                    VarBinViewArray::new_handle_unchecked(
701                        views,
702                        buffers,
703                        dtype,
704                        validity.execute(ctx)?,
705                    )
706                })))
707            }
708            Canonical::List(l) => {
709                let zctl = l.is_zero_copy_to_list();
710                let ListViewArrayParts {
711                    elements,
712                    offsets,
713                    sizes,
714                    validity,
715                    ..
716                } = l.into_parts();
717                Ok(RecursiveCanonical(Canonical::List(unsafe {
718                    ListViewArray::new_unchecked(
719                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
720                        offsets.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
721                        sizes.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
722                        validity.execute(ctx)?,
723                    )
724                    .with_zero_copy_to_list(zctl)
725                })))
726            }
727            Canonical::FixedSizeList(fsl) => {
728                let list_size = fsl.list_size();
729                let len = fsl.len();
730                let (elements, validity, _) = fsl.into_parts();
731                Ok(RecursiveCanonical(Canonical::FixedSizeList(
732                    FixedSizeListArray::new(
733                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
734                        list_size,
735                        validity.execute(ctx)?,
736                        len,
737                    ),
738                )))
739            }
740            Canonical::Struct(st) => {
741                let len = st.len();
742                let StructArrayParts {
743                    struct_fields,
744                    fields,
745                    validity,
746                } = st.into_parts();
747                let executed_fields = fields
748                    .iter()
749                    .map(|f| Ok(f.clone().execute::<RecursiveCanonical>(ctx)?.0.into_array()))
750                    .collect::<VortexResult<Arc<[_]>>>()?;
751
752                Ok(RecursiveCanonical(Canonical::Struct(unsafe {
753                    StructArray::new_unchecked(
754                        executed_fields,
755                        struct_fields,
756                        len,
757                        validity.execute(ctx)?,
758                    )
759                })))
760            }
761            Canonical::Extension(ext) => Ok(RecursiveCanonical(Canonical::Extension(
762                ExtensionArray::new(
763                    ext.ext_dtype().clone(),
764                    ext.storage_array()
765                        .clone()
766                        .execute::<RecursiveCanonical>(ctx)?
767                        .0
768                        .into_array(),
769                ),
770            ))),
771        }
772    }
773}
774
775/// Execute a primitive typed array into a buffer of native values, assuming all values are valid.
776///
777/// # Errors
778///
779/// Returns a `VortexError` if the array is not all-valid (has any nulls).
780impl<T: NativePType> Executable for Buffer<T> {
781    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
782        let array = PrimitiveArray::execute(array, ctx)?;
783        vortex_ensure!(
784            array.all_valid()?,
785            "Cannot execute to native buffer: array is not all-valid."
786        );
787        Ok(array.into_buffer())
788    }
789}
790
791/// Execute the array to canonical form and unwrap as a [`PrimitiveArray`].
792///
793/// This will panic if the array's dtype is not primitive.
794impl Executable for PrimitiveArray {
795    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
796        match array.try_into::<Primitive>() {
797            Ok(primitive) => Ok(primitive),
798            Err(array) => Ok(Canonical::execute(array, ctx)?.into_primitive()),
799        }
800    }
801}
802
803/// Execute the array to canonical form and unwrap as a [`BoolArray`].
804///
805/// This will panic if the array's dtype is not bool.
806impl Executable for BoolArray {
807    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
808        match array.try_into::<Bool>() {
809            Ok(bool_array) => Ok(bool_array),
810            Err(array) => Ok(Canonical::execute(array, ctx)?.into_bool()),
811        }
812    }
813}
814
815/// Execute the array to a [`BitBuffer`], aka a non-nullable  [`BoolArray`].
816///
817/// This will panic if the array's dtype is not non-nullable bool.
818impl Executable for BitBuffer {
819    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
820        let bool = BoolArray::execute(array, ctx)?;
821        assert!(
822            !bool.dtype().is_nullable(),
823            "bit buffer execute only works with non-nullable bool arrays"
824        );
825        Ok(bool.into_bit_buffer())
826    }
827}
828
829/// Execute the array to canonical form and unwrap as a [`NullArray`].
830///
831/// This will panic if the array's dtype is not null.
832impl Executable for NullArray {
833    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
834        match array.try_into::<Null>() {
835            Ok(null_array) => Ok(null_array),
836            Err(array) => Ok(Canonical::execute(array, ctx)?.into_null()),
837        }
838    }
839}
840
841/// Execute the array to canonical form and unwrap as a [`VarBinViewArray`].
842///
843/// This will panic if the array's dtype is not utf8 or binary.
844impl Executable for VarBinViewArray {
845    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
846        match array.try_into::<VarBinView>() {
847            Ok(varbinview) => Ok(varbinview),
848            Err(array) => Ok(Canonical::execute(array, ctx)?.into_varbinview()),
849        }
850    }
851}
852
853/// Execute the array to canonical form and unwrap as an [`ExtensionArray`].
854///
855/// This will panic if the array's dtype is not an extension type.
856impl Executable for ExtensionArray {
857    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
858        match array.try_into::<Extension>() {
859            Ok(ext_array) => Ok(ext_array),
860            Err(array) => Ok(Canonical::execute(array, ctx)?.into_extension()),
861        }
862    }
863}
864
865/// Execute the array to canonical form and unwrap as a [`DecimalArray`].
866///
867/// This will panic if the array's dtype is not decimal.
868impl Executable for DecimalArray {
869    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
870        match array.try_into::<Decimal>() {
871            Ok(decimal) => Ok(decimal),
872            Err(array) => Ok(Canonical::execute(array, ctx)?.into_decimal()),
873        }
874    }
875}
876
877/// Execute the array to canonical form and unwrap as a [`ListViewArray`].
878///
879/// This will panic if the array's dtype is not list.
880impl Executable for ListViewArray {
881    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
882        match array.try_into::<ListView>() {
883            Ok(list) => Ok(list),
884            Err(array) => Ok(Canonical::execute(array, ctx)?.into_listview()),
885        }
886    }
887}
888
889/// Execute the array to canonical form and unwrap as a [`FixedSizeListArray`].
890///
891/// This will panic if the array's dtype is not fixed size list.
892impl Executable for FixedSizeListArray {
893    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
894        match array.try_into::<FixedSizeList>() {
895            Ok(fsl) => Ok(fsl),
896            Err(array) => Ok(Canonical::execute(array, ctx)?.into_fixed_size_list()),
897        }
898    }
899}
900
901/// Execute the array to canonical form and unwrap as a [`StructArray`].
902///
903/// This will panic if the array's dtype is not struct.
904impl Executable for StructArray {
905    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
906        match array.try_into::<Struct>() {
907            Ok(struct_array) => Ok(struct_array),
908            Err(array) => Ok(Canonical::execute(array, ctx)?.into_struct()),
909        }
910    }
911}
912
913/// A view into a canonical array type.
914#[derive(Debug, Clone)]
915pub enum CanonicalView<'a> {
916    Null(&'a NullArray),
917    Bool(&'a BoolArray),
918    Primitive(&'a PrimitiveArray),
919    Decimal(&'a DecimalArray),
920    VarBinView(&'a VarBinViewArray),
921    List(&'a ListViewArray),
922    FixedSizeList(&'a FixedSizeListArray),
923    Struct(&'a StructArray),
924    Extension(&'a ExtensionArray),
925}
926
927impl From<CanonicalView<'_>> for Canonical {
928    fn from(value: CanonicalView<'_>) -> Self {
929        match value {
930            CanonicalView::Null(a) => Canonical::Null(a.clone()),
931            CanonicalView::Bool(a) => Canonical::Bool(a.clone()),
932            CanonicalView::Primitive(a) => Canonical::Primitive(a.clone()),
933            CanonicalView::Decimal(a) => Canonical::Decimal(a.clone()),
934            CanonicalView::VarBinView(a) => Canonical::VarBinView(a.clone()),
935            CanonicalView::List(a) => Canonical::List(a.clone()),
936            CanonicalView::FixedSizeList(a) => Canonical::FixedSizeList(a.clone()),
937            CanonicalView::Struct(a) => Canonical::Struct(a.clone()),
938            CanonicalView::Extension(a) => Canonical::Extension(a.clone()),
939        }
940    }
941}
942
943impl AsRef<dyn DynArray> for CanonicalView<'_> {
944    fn as_ref(&self) -> &dyn DynArray {
945        match self {
946            CanonicalView::Null(a) => a.as_ref(),
947            CanonicalView::Bool(a) => a.as_ref(),
948            CanonicalView::Primitive(a) => a.as_ref(),
949            CanonicalView::Decimal(a) => a.as_ref(),
950            CanonicalView::VarBinView(a) => a.as_ref(),
951            CanonicalView::List(a) => a.as_ref(),
952            CanonicalView::FixedSizeList(a) => a.as_ref(),
953            CanonicalView::Struct(a) => a.as_ref(),
954            CanonicalView::Extension(a) => a.as_ref(),
955        }
956    }
957}
958
959/// A matcher for any canonical array type.
960pub struct AnyCanonical;
961impl Matcher for AnyCanonical {
962    type Match<'a> = CanonicalView<'a>;
963
964    fn matches(array: &dyn DynArray) -> bool {
965        array.is::<Null>()
966            || array.is::<Bool>()
967            || array.is::<Primitive>()
968            || array.is::<Decimal>()
969            || array.is::<Struct>()
970            || array.is::<ListView>()
971            || array.is::<FixedSizeList>()
972            || array.is::<VarBinView>()
973            || array.is::<Extension>()
974    }
975
976    fn try_match<'a>(array: &'a dyn DynArray) -> Option<Self::Match<'a>> {
977        if let Some(a) = array.as_opt::<Null>() {
978            Some(CanonicalView::Null(a))
979        } else if let Some(a) = array.as_opt::<Bool>() {
980            Some(CanonicalView::Bool(a))
981        } else if let Some(a) = array.as_opt::<Primitive>() {
982            Some(CanonicalView::Primitive(a))
983        } else if let Some(a) = array.as_opt::<Decimal>() {
984            Some(CanonicalView::Decimal(a))
985        } else if let Some(a) = array.as_opt::<Struct>() {
986            Some(CanonicalView::Struct(a))
987        } else if let Some(a) = array.as_opt::<ListView>() {
988            Some(CanonicalView::List(a))
989        } else if let Some(a) = array.as_opt::<FixedSizeList>() {
990            Some(CanonicalView::FixedSizeList(a))
991        } else if let Some(a) = array.as_opt::<VarBinView>() {
992            Some(CanonicalView::VarBinView(a))
993        } else {
994            array.as_opt::<Extension>().map(CanonicalView::Extension)
995        }
996    }
997}
998
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use arrow_array::Array as ArrowArray;
    use arrow_array::ArrayRef as ArrowArrayRef;
    use arrow_array::ListArray as ArrowListArray;
    use arrow_array::PrimitiveArray as ArrowPrimitiveArray;
    use arrow_array::StringArray;
    use arrow_array::StringViewArray;
    use arrow_array::StructArray as ArrowStructArray;
    use arrow_array::cast::AsArray;
    use arrow_array::types::Int32Type;
    use arrow_array::types::Int64Type;
    use arrow_array::types::UInt64Type;
    use arrow_buffer::NullBufferBuilder;
    use arrow_buffer::OffsetBuffer;
    use arrow_schema::DataType;
    use arrow_schema::Field;
    use vortex_buffer::buffer;

    use crate::ArrayRef;
    use crate::IntoArray;
    use crate::arrays::ConstantArray;
    use crate::arrow::FromArrowArray;
    use crate::arrow::IntoArrowArray;
    use crate::canonical::StructArray;

    /// Converting a nested struct to Arrow must canonicalize non-canonical children too.
    #[test]
    fn test_canonicalize_nested_struct() {
        // The nested struct contains a ConstantArray representing the primitive array
        //   [100i64]
        // ConstantArray is not a canonical type, so converting `into_arrow()` should
        // map this to the nearest canonical type (PrimitiveArray).
        let inner = StructArray::from_fields(&[(
            "inner_a",
            ConstantArray::new(100i64, 1).into_array(),
        )])
        .unwrap();

        // Create a struct array with multiple internal components.
        let outer = StructArray::from_fields(&[
            ("a", buffer![1u64].into_array()),
            ("b", inner.into_array()),
        ])
        .unwrap();

        let exported = outer.into_array().into_arrow_preferred().unwrap();
        let arrow_struct = exported
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .cloned()
            .unwrap();

        // Column "a" must come out as a plain u64 primitive array.
        let first_column = arrow_struct.column(0).as_any();
        assert!(first_column.is::<ArrowPrimitiveArray<UInt64Type>>());

        // Column "b" must come out as a struct whose only child is an i64 primitive array.
        let second_column = arrow_struct.column(1).clone();
        let inner_struct = second_column
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .cloned()
            .unwrap();

        let inner_a = inner_struct
            .column(0)
            .as_any()
            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
        assert!(inner_a.is_some());

        let expected: ArrowPrimitiveArray<Int64Type> = ArrowPrimitiveArray::from_iter([100i64]);
        assert_eq!(inner_a.cloned().unwrap(), expected);
    }

    /// An Arrow struct with nullable rows and nullable children survives a Vortex round trip.
    #[test]
    fn roundtrip_struct() {
        // Struct-level validity: rows 0..=3 are valid, row 4 is null, row 5 is valid.
        let mut struct_validity = NullBufferBuilder::new(6);
        struct_validity.append_n_non_nulls(4);
        struct_validity.append_null();
        struct_validity.append_non_null();

        let name_column = Arc::new(StringViewArray::from_iter(vec![
            Some("Joseph"),
            None,
            Some("Angela"),
            Some("Mikhail"),
            None,
            None,
        ]));
        let age_column = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
            Some(25),
            Some(31),
            None,
            Some(57),
            None,
            None,
        ]));

        let fields = vec![
            Arc::new(Field::new("name", DataType::Utf8View, true)),
            Arc::new(Field::new("age", DataType::Int32, true)),
        ];
        let arrow_struct = ArrowStructArray::new(
            fields.into(),
            vec![name_column, age_column],
            struct_validity.finish(),
        );

        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true).unwrap();
        let roundtripped = vortex_struct.into_arrow_preferred().unwrap();

        assert_eq!(&arrow_struct, roundtripped.as_struct());
    }

    /// An Arrow list of strings survives a Vortex round trip when exported to its own data type.
    #[test]
    fn roundtrip_list() {
        let element_values = Arc::new(StringArray::from_iter(vec![
            Some("Joseph"),
            Some("Angela"),
            Some("Mikhail"),
        ]));
        let element_field = Arc::new(Field::new_list_field(DataType::Utf8, true));
        let offsets = OffsetBuffer::from_lengths(vec![0, 2, 1]);

        let arrow_list = ArrowListArray::new(element_field, offsets, element_values, None);
        let list_data_type = arrow_list.data_type();

        let vortex_list = ArrayRef::from_arrow(&arrow_list, true).unwrap();
        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();

        let expected: ArrowArrayRef = Arc::new(arrow_list.clone());
        assert_eq!(expected.as_ref(), rt_arrow_list.as_ref());
    }
}