Skip to main content

vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use std::sync::Arc;
7
8use vortex_buffer::BitBuffer;
9use vortex_buffer::Buffer;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_ensure;
13use vortex_error::vortex_panic;
14
15use crate::ArrayRef;
16use crate::DynArray;
17use crate::Executable;
18use crate::ExecutionCtx;
19use crate::IntoArray;
20use crate::arrays::Bool;
21use crate::arrays::BoolArray;
22use crate::arrays::Decimal;
23use crate::arrays::DecimalArray;
24use crate::arrays::Extension;
25use crate::arrays::ExtensionArray;
26use crate::arrays::FixedSizeList;
27use crate::arrays::FixedSizeListArray;
28use crate::arrays::ListView;
29use crate::arrays::ListViewArray;
30use crate::arrays::Null;
31use crate::arrays::NullArray;
32use crate::arrays::Primitive;
33use crate::arrays::PrimitiveArray;
34use crate::arrays::Struct;
35use crate::arrays::StructArray;
36use crate::arrays::VarBinView;
37use crate::arrays::VarBinViewArray;
38use crate::arrays::Variant;
39use crate::arrays::VariantArray;
40use crate::arrays::bool::BoolArrayParts;
41use crate::arrays::decimal::DecimalArrayParts;
42use crate::arrays::listview::ListViewArrayParts;
43use crate::arrays::listview::ListViewRebuildMode;
44use crate::arrays::primitive::PrimitiveArrayParts;
45use crate::arrays::struct_::StructArrayParts;
46use crate::arrays::varbinview::VarBinViewArrayParts;
47use crate::dtype::DType;
48use crate::dtype::NativePType;
49use crate::dtype::Nullability;
50use crate::dtype::PType;
51use crate::match_each_decimal_value_type;
52use crate::match_each_native_ptype;
53use crate::matcher::Matcher;
54use crate::validity::Validity;
55
56/// An enum capturing the default uncompressed encodings for each [Vortex type](DType).
57///
58/// Any array can be decoded into canonical form via the [`to_canonical`](DynArray::to_canonical)
59/// trait method. This is the simplest encoding for a type, and will not be compressed but may
60/// contain compressed child arrays.
61///
62/// Canonical form is useful for doing type-specific compute where you need to know that all
63/// elements are laid out decompressed and contiguous in memory.
64///
65/// Each `Canonical` variant has a corresponding [`DType`] variant, with the notable exception of
66/// [`Canonical::VarBinView`], which is the canonical encoding for both [`DType::Utf8`] and
67/// [`DType::Binary`].
68///
69/// # Laziness
70///
71/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
72/// `Struct` type, individual column child arrays may still be compressed. This allows
73/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
74/// never need to be decoded into canonical form at all depending on the compute.
75///
76/// # Arrow interoperability
77///
78/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
79/// zero-copy, and the corresponding Arrow array types can also be built directly.
80///
81/// The full list of canonical types and their equivalent Arrow array types are:
82///
83/// * `NullArray`: [`arrow_array::NullArray`]
84/// * `BoolArray`: [`arrow_array::BooleanArray`]
85/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
86/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
87/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
88/// * `ListViewArray`: [`arrow_array::ListViewArray`]
89/// * `FixedSizeListArray`: [`arrow_array::FixedSizeListArray`]
90/// * `StructArray`: [`arrow_array::StructArray`]
91///
92/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
93/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
94/// create ambiguity.
95/// Thus, if you receive an Arrow array, compress it using Vortex, and then
96/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
97/// variants to hold the data.
98///
99/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
100/// will correspond to an arrow-rs [`arrow_schema::DataType`].
101///
102/// # Views support
103///
104/// Binary and String views, also known as "German strings" are a better encoding format for
105/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
106/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
107/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
108/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
109/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
110/// from memory.
111///
112/// # For Developers
113///
114/// If you add another variant to this enum, make sure to update `dyn Array::is_canonical`,
115/// and the fuzzer in `fuzz/fuzz_targets/array_ops.rs`.
#[derive(Debug, Clone)]
pub enum Canonical {
    /// Canonical array for [`DType::Null`].
    Null(NullArray),
    /// Canonical array for [`DType::Bool`].
    Bool(BoolArray),
    /// Canonical array for [`DType::Primitive`].
    Primitive(PrimitiveArray),
    /// Canonical array for [`DType::Decimal`].
    Decimal(DecimalArray),
    /// Canonical array shared by [`DType::Utf8`] and [`DType::Binary`].
    VarBinView(VarBinViewArray),
    /// Canonical array for [`DType::List`]; note that it wraps a [`ListViewArray`].
    List(ListViewArray),
    /// Canonical array for [`DType::FixedSizeList`].
    FixedSizeList(FixedSizeListArray),
    /// Canonical array for [`DType::Struct`].
    Struct(StructArray),
    /// Canonical array for [`DType::Extension`], wrapping the storage array.
    Extension(ExtensionArray),
    /// Canonical array holding a [`VariantArray`].
    Variant(VariantArray),
}
129
/// Dispatch over every [`Canonical`] variant, binding the wrapped array to `$binding` and
/// evaluating the same `$body` expression in each arm.
///
/// Arms are listed in the declaration order of [`Canonical`] and there is deliberately no
/// wildcard arm, so adding a variant to the enum fails to compile until it is handled here.
macro_rules! match_each_canonical {
    ($canonical:expr, | $binding:ident | $body:expr) => {{
        match $canonical {
            Canonical::Null($binding) => $body,
            Canonical::Bool($binding) => $body,
            Canonical::Primitive($binding) => $body,
            Canonical::Decimal($binding) => $body,
            Canonical::VarBinView($binding) => $body,
            Canonical::List($binding) => $body,
            Canonical::FixedSizeList($binding) => $body,
            Canonical::Struct($binding) => $body,
            Canonical::Extension($binding) => $body,
            Canonical::Variant($binding) => $body,
        }
    }};
}
147
148impl Canonical {
149    /// Create an empty canonical array of the given dtype.
150    pub fn empty(dtype: &DType) -> Canonical {
151        match dtype {
152            DType::Null => Canonical::Null(NullArray::new(0)),
153            DType::Bool(n) => Canonical::Bool(unsafe {
154                BoolArray::new_unchecked(BitBuffer::empty(), Validity::from(n))
155            }),
156            DType::Primitive(ptype, n) => {
157                match_each_native_ptype!(ptype, |P| {
158                    Canonical::Primitive(unsafe {
159                        PrimitiveArray::new_unchecked(Buffer::<P>::empty(), Validity::from(n))
160                    })
161                })
162            }
163            DType::Decimal(decimal_type, n) => {
164                match_each_decimal_value_type!(
165                    DecimalType::smallest_decimal_value_type(decimal_type),
166                    |D| {
167                        Canonical::Decimal(unsafe {
168                            DecimalArray::new_unchecked::<D>(
169                                Buffer::empty(),
170                                *decimal_type,
171                                Validity::from(n),
172                            )
173                        })
174                    }
175                )
176            }
177            DType::Utf8(n) => Canonical::VarBinView(unsafe {
178                VarBinViewArray::new_unchecked(
179                    Buffer::empty(),
180                    Arc::new([]),
181                    dtype.clone(),
182                    Validity::from(n),
183                )
184            }),
185            DType::Binary(n) => Canonical::VarBinView(unsafe {
186                VarBinViewArray::new_unchecked(
187                    Buffer::empty(),
188                    Arc::new([]),
189                    dtype.clone(),
190                    Validity::from(n),
191                )
192            }),
193            DType::Struct(struct_dtype, n) => Canonical::Struct(unsafe {
194                StructArray::new_unchecked(
195                    struct_dtype
196                        .fields()
197                        .map(|f| Canonical::empty(&f).into_array())
198                        .collect::<Arc<[_]>>(),
199                    struct_dtype.clone(),
200                    0,
201                    Validity::from(n),
202                )
203            }),
204            DType::List(dtype, n) => Canonical::List(unsafe {
205                ListViewArray::new_unchecked(
206                    Canonical::empty(dtype).into_array(),
207                    Canonical::empty(&DType::Primitive(PType::U8, Nullability::NonNullable))
208                        .into_array(),
209                    Canonical::empty(&DType::Primitive(PType::U8, Nullability::NonNullable))
210                        .into_array(),
211                    Validity::from(n),
212                )
213                // An empty list view is trivially copyable to a list.
214                .with_zero_copy_to_list(true)
215            }),
216            DType::FixedSizeList(elem_dtype, list_size, null) => Canonical::FixedSizeList(unsafe {
217                FixedSizeListArray::new_unchecked(
218                    Canonical::empty(elem_dtype).into_array(),
219                    *list_size,
220                    Validity::from(null),
221                    0,
222                )
223            }),
224            DType::Extension(ext_dtype) => Canonical::Extension(ExtensionArray::new(
225                ext_dtype.clone(),
226                Canonical::empty(ext_dtype.storage_dtype()).into_array(),
227            )),
228            DType::Variant(_) => {
229                vortex_panic!(InvalidArgument: "Canonical empty is not supported for Variant")
230            }
231        }
232    }
233
234    pub fn len(&self) -> usize {
235        match_each_canonical!(self, |arr| arr.len())
236    }
237
238    pub fn dtype(&self) -> &DType {
239        match_each_canonical!(self, |arr| arr.dtype())
240    }
241
242    pub fn is_empty(&self) -> bool {
243        match_each_canonical!(self, |arr| arr.is_empty())
244    }
245}
246
247impl Canonical {
248    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
249    ///
250    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
251    /// they can accumulate wasted space after slicing and taking operations.
252    ///
253    /// This operation is very expensive and can result in things like allocations, full-scans
254    /// and copy operations.
255    pub fn compact(&self) -> VortexResult<Canonical> {
256        match self {
257            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
258            Canonical::List(array) => Ok(Canonical::List(
259                array.rebuild(ListViewRebuildMode::TrimElements)?,
260            )),
261            _ => Ok(self.clone()),
262        }
263    }
264}
265
266// Unwrap canonical type back down to specialized type.
267impl Canonical {
268    pub fn as_null(&self) -> &NullArray {
269        if let Canonical::Null(a) = self {
270            a
271        } else {
272            vortex_panic!("Cannot get NullArray from {:?}", &self)
273        }
274    }
275
276    pub fn into_null(self) -> NullArray {
277        if let Canonical::Null(a) = self {
278            a
279        } else {
280            vortex_panic!("Cannot unwrap NullArray from {:?}", &self)
281        }
282    }
283
284    pub fn as_bool(&self) -> &BoolArray {
285        if let Canonical::Bool(a) = self {
286            a
287        } else {
288            vortex_panic!("Cannot get BoolArray from {:?}", &self)
289        }
290    }
291
292    pub fn into_bool(self) -> BoolArray {
293        if let Canonical::Bool(a) = self {
294            a
295        } else {
296            vortex_panic!("Cannot unwrap BoolArray from {:?}", &self)
297        }
298    }
299
300    pub fn as_primitive(&self) -> &PrimitiveArray {
301        if let Canonical::Primitive(a) = self {
302            a
303        } else {
304            vortex_panic!("Cannot get PrimitiveArray from {:?}", &self)
305        }
306    }
307
308    pub fn into_primitive(self) -> PrimitiveArray {
309        if let Canonical::Primitive(a) = self {
310            a
311        } else {
312            vortex_panic!("Cannot unwrap PrimitiveArray from {:?}", &self)
313        }
314    }
315
316    pub fn as_decimal(&self) -> &DecimalArray {
317        if let Canonical::Decimal(a) = self {
318            a
319        } else {
320            vortex_panic!("Cannot get DecimalArray from {:?}", &self)
321        }
322    }
323
324    pub fn into_decimal(self) -> DecimalArray {
325        if let Canonical::Decimal(a) = self {
326            a
327        } else {
328            vortex_panic!("Cannot unwrap DecimalArray from {:?}", &self)
329        }
330    }
331
332    pub fn as_varbinview(&self) -> &VarBinViewArray {
333        if let Canonical::VarBinView(a) = self {
334            a
335        } else {
336            vortex_panic!("Cannot get VarBinViewArray from {:?}", &self)
337        }
338    }
339
340    pub fn into_varbinview(self) -> VarBinViewArray {
341        if let Canonical::VarBinView(a) = self {
342            a
343        } else {
344            vortex_panic!("Cannot unwrap VarBinViewArray from {:?}", &self)
345        }
346    }
347
348    pub fn as_listview(&self) -> &ListViewArray {
349        if let Canonical::List(a) = self {
350            a
351        } else {
352            vortex_panic!("Cannot get ListArray from {:?}", &self)
353        }
354    }
355
356    pub fn into_listview(self) -> ListViewArray {
357        if let Canonical::List(a) = self {
358            a
359        } else {
360            vortex_panic!("Cannot unwrap ListArray from {:?}", &self)
361        }
362    }
363
364    pub fn as_fixed_size_list(&self) -> &FixedSizeListArray {
365        if let Canonical::FixedSizeList(a) = self {
366            a
367        } else {
368            vortex_panic!("Cannot get FixedSizeListArray from {:?}", &self)
369        }
370    }
371
372    pub fn into_fixed_size_list(self) -> FixedSizeListArray {
373        if let Canonical::FixedSizeList(a) = self {
374            a
375        } else {
376            vortex_panic!("Cannot unwrap FixedSizeListArray from {:?}", &self)
377        }
378    }
379
380    pub fn as_struct(&self) -> &StructArray {
381        if let Canonical::Struct(a) = self {
382            a
383        } else {
384            vortex_panic!("Cannot get StructArray from {:?}", &self)
385        }
386    }
387
388    pub fn into_struct(self) -> StructArray {
389        if let Canonical::Struct(a) = self {
390            a
391        } else {
392            vortex_panic!("Cannot unwrap StructArray from {:?}", &self)
393        }
394    }
395
396    pub fn as_extension(&self) -> &ExtensionArray {
397        if let Canonical::Extension(a) = self {
398            a
399        } else {
400            vortex_panic!("Cannot get ExtensionArray from {:?}", &self)
401        }
402    }
403
404    pub fn into_extension(self) -> ExtensionArray {
405        if let Canonical::Extension(a) = self {
406            a
407        } else {
408            vortex_panic!("Cannot unwrap ExtensionArray from {:?}", &self)
409        }
410    }
411}
412
413impl AsRef<dyn DynArray> for Canonical {
414    fn as_ref(&self) -> &(dyn DynArray + 'static) {
415        match_each_canonical!(self, |arr| arr.as_ref())
416    }
417}
418
419impl IntoArray for Canonical {
420    fn into_array(self) -> ArrayRef {
421        match_each_canonical!(self, |arr| arr.into_array())
422    }
423}
424
/// Convenience conversions from a borrowed array into an owned canonical array of a specific
/// variant.
///
/// # Canonicalization
///
/// This trait has a blanket implementation for every type implementing [`DynArray`]. That
/// implementation canonicalizes the array via [`to_canonical`](DynArray::to_canonical) and
/// unwraps the requested variant; it panics if canonicalization fails or if the canonical form
/// is a different variant than the one requested.
pub trait ToCanonical {
    /// Canonicalize into a [`NullArray`] if the target is [`Null`](DType::Null) typed.
    fn to_null(&self) -> NullArray;

    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`](DType::Bool) typed.
    fn to_bool(&self) -> BoolArray;

    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`](DType::Primitive)
    /// typed.
    fn to_primitive(&self) -> PrimitiveArray;

    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`](DType::Decimal)
    /// typed.
    fn to_decimal(&self) -> DecimalArray;

    /// Canonicalize into a [`StructArray`] if the target is [`Struct`](DType::Struct) typed.
    fn to_struct(&self) -> StructArray;

    /// Canonicalize into a [`ListViewArray`] if the target is [`List`](DType::List) typed.
    fn to_listview(&self) -> ListViewArray;

    /// Canonicalize into a [`FixedSizeListArray`] if the target is
    /// [`FixedSizeList`](DType::FixedSizeList) typed.
    fn to_fixed_size_list(&self) -> FixedSizeListArray;

    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`](DType::Utf8)
    /// or [`Binary`](DType::Binary) typed.
    fn to_varbinview(&self) -> VarBinViewArray;

    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`](DType::Extension)
    /// typed.
    fn to_extension(&self) -> ExtensionArray;
}
463
464// Blanket impl for all Array encodings.
465impl<A: DynArray + ?Sized> ToCanonical for A {
466    fn to_null(&self) -> NullArray {
467        self.to_canonical()
468            .vortex_expect("to_canonical failed")
469            .into_null()
470    }
471
472    fn to_bool(&self) -> BoolArray {
473        self.to_canonical()
474            .vortex_expect("to_canonical failed")
475            .into_bool()
476    }
477
478    fn to_primitive(&self) -> PrimitiveArray {
479        self.to_canonical()
480            .vortex_expect("to_canonical failed")
481            .into_primitive()
482    }
483
484    fn to_decimal(&self) -> DecimalArray {
485        self.to_canonical()
486            .vortex_expect("to_canonical failed")
487            .into_decimal()
488    }
489
490    fn to_struct(&self) -> StructArray {
491        self.to_canonical()
492            .vortex_expect("to_canonical failed")
493            .into_struct()
494    }
495
496    fn to_listview(&self) -> ListViewArray {
497        self.to_canonical()
498            .vortex_expect("to_canonical failed")
499            .into_listview()
500    }
501
502    fn to_fixed_size_list(&self) -> FixedSizeListArray {
503        self.to_canonical()
504            .vortex_expect("to_canonical failed")
505            .into_fixed_size_list()
506    }
507
508    fn to_varbinview(&self) -> VarBinViewArray {
509        self.to_canonical()
510            .vortex_expect("to_canonical failed")
511            .into_varbinview()
512    }
513
514    fn to_extension(&self) -> ExtensionArray {
515        self.to_canonical()
516            .vortex_expect("to_canonical failed")
517            .into_extension()
518    }
519}
520
521impl From<Canonical> for ArrayRef {
522    fn from(value: Canonical) -> Self {
523        match_each_canonical!(value, |arr| arr.into_array())
524    }
525}
526
527/// Execute into [`Canonical`] by running `execute_until` with the [`AnyCanonical`] matcher.
528///
529/// Unlike executing into [`crate::Columnar`], this will fully expand constant arrays into their
530/// canonical form. Callers should prefer to execute into `Columnar` if they are able to optimize
531/// their use for constant arrays.
532impl Executable for Canonical {
533    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
534        let result = array.execute_until::<AnyCanonical>(ctx)?;
535        Ok(result
536            .as_opt::<AnyCanonical>()
537            .map(Canonical::from)
538            .vortex_expect("execute_until::<AnyCanonical> must return a canonical array"))
539    }
540}
541
542/// Recursively execute the array until it reaches canonical form along with its validity.
543///
544/// Callers should prefer to execute into `Columnar` instead of this specific target.
545/// This target is useful when preparing arrays for writing.
546pub struct CanonicalValidity(pub Canonical);
547
548impl Executable for CanonicalValidity {
549    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
550        match array.execute::<Canonical>(ctx)? {
551            n @ Canonical::Null(_) => Ok(CanonicalValidity(n)),
552            Canonical::Bool(b) => {
553                let BoolArrayParts {
554                    bits,
555                    offset,
556                    len,
557                    validity,
558                } = b.into_parts();
559                Ok(CanonicalValidity(Canonical::Bool(
560                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
561                )))
562            }
563            Canonical::Primitive(p) => {
564                let PrimitiveArrayParts {
565                    ptype,
566                    buffer,
567                    validity,
568                } = p.into_parts();
569                Ok(CanonicalValidity(Canonical::Primitive(unsafe {
570                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
571                })))
572            }
573            Canonical::Decimal(d) => {
574                let DecimalArrayParts {
575                    decimal_dtype,
576                    values,
577                    values_type,
578                    validity,
579                } = d.into_parts();
580                Ok(CanonicalValidity(Canonical::Decimal(unsafe {
581                    DecimalArray::new_unchecked_handle(
582                        values,
583                        values_type,
584                        decimal_dtype,
585                        validity.execute(ctx)?,
586                    )
587                })))
588            }
589            Canonical::VarBinView(vbv) => {
590                let VarBinViewArrayParts {
591                    dtype,
592                    buffers,
593                    views,
594                    validity,
595                } = vbv.into_parts();
596                Ok(CanonicalValidity(Canonical::VarBinView(unsafe {
597                    VarBinViewArray::new_handle_unchecked(
598                        views,
599                        buffers,
600                        dtype,
601                        validity.execute(ctx)?,
602                    )
603                })))
604            }
605            Canonical::List(l) => {
606                let zctl = l.is_zero_copy_to_list();
607                let ListViewArrayParts {
608                    elements,
609                    offsets,
610                    sizes,
611                    validity,
612                    ..
613                } = l.into_parts();
614                Ok(CanonicalValidity(Canonical::List(unsafe {
615                    ListViewArray::new_unchecked(elements, offsets, sizes, validity.execute(ctx)?)
616                        .with_zero_copy_to_list(zctl)
617                })))
618            }
619            Canonical::FixedSizeList(fsl) => {
620                let list_size = fsl.list_size();
621                let len = fsl.len();
622                let (elements, validity, _) = fsl.into_parts();
623                Ok(CanonicalValidity(Canonical::FixedSizeList(
624                    FixedSizeListArray::new(elements, list_size, validity.execute(ctx)?, len),
625                )))
626            }
627            Canonical::Struct(st) => {
628                let len = st.len();
629                let StructArrayParts {
630                    struct_fields,
631                    fields,
632                    validity,
633                } = st.into_parts();
634                Ok(CanonicalValidity(Canonical::Struct(unsafe {
635                    StructArray::new_unchecked(fields, struct_fields, len, validity.execute(ctx)?)
636                })))
637            }
638            Canonical::Extension(ext) => Ok(CanonicalValidity(Canonical::Extension(
639                ExtensionArray::new(
640                    ext.ext_dtype().clone(),
641                    ext.storage_array()
642                        .clone()
643                        .execute::<CanonicalValidity>(ctx)?
644                        .0
645                        .into_array(),
646                ),
647            ))),
648            Canonical::Variant(variant) => {
649                Ok(CanonicalValidity(Canonical::Variant(VariantArray::new(
650                    variant
651                        .child()
652                        .clone()
653                        .execute::<CanonicalValidity>(ctx)?
654                        .0
655                        .into_array(),
656                ))))
657            }
658        }
659    }
660}
661
662/// Recursively execute the array until all of its children are canonical.
663///
664/// This method is useful to guarantee that all operators are fully executed,
665/// callers should prefer an execution target that's suitable for their use case instead of this one.
666pub struct RecursiveCanonical(pub Canonical);
667
668impl Executable for RecursiveCanonical {
669    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
670        match array.execute::<Canonical>(ctx)? {
671            n @ Canonical::Null(_) => Ok(RecursiveCanonical(n)),
672            Canonical::Bool(b) => {
673                let BoolArrayParts {
674                    bits,
675                    offset,
676                    len,
677                    validity,
678                } = b.into_parts();
679                Ok(RecursiveCanonical(Canonical::Bool(
680                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
681                )))
682            }
683            Canonical::Primitive(p) => {
684                let PrimitiveArrayParts {
685                    ptype,
686                    buffer,
687                    validity,
688                } = p.into_parts();
689                Ok(RecursiveCanonical(Canonical::Primitive(unsafe {
690                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
691                })))
692            }
693            Canonical::Decimal(d) => {
694                let DecimalArrayParts {
695                    decimal_dtype,
696                    values,
697                    values_type,
698                    validity,
699                } = d.into_parts();
700                Ok(RecursiveCanonical(Canonical::Decimal(unsafe {
701                    DecimalArray::new_unchecked_handle(
702                        values,
703                        values_type,
704                        decimal_dtype,
705                        validity.execute(ctx)?,
706                    )
707                })))
708            }
709            Canonical::VarBinView(vbv) => {
710                let VarBinViewArrayParts {
711                    dtype,
712                    buffers,
713                    views,
714                    validity,
715                } = vbv.into_parts();
716                Ok(RecursiveCanonical(Canonical::VarBinView(unsafe {
717                    VarBinViewArray::new_handle_unchecked(
718                        views,
719                        buffers,
720                        dtype,
721                        validity.execute(ctx)?,
722                    )
723                })))
724            }
725            Canonical::List(l) => {
726                let zctl = l.is_zero_copy_to_list();
727                let ListViewArrayParts {
728                    elements,
729                    offsets,
730                    sizes,
731                    validity,
732                    ..
733                } = l.into_parts();
734                Ok(RecursiveCanonical(Canonical::List(unsafe {
735                    ListViewArray::new_unchecked(
736                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
737                        offsets.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
738                        sizes.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
739                        validity.execute(ctx)?,
740                    )
741                    .with_zero_copy_to_list(zctl)
742                })))
743            }
744            Canonical::FixedSizeList(fsl) => {
745                let list_size = fsl.list_size();
746                let len = fsl.len();
747                let (elements, validity, _) = fsl.into_parts();
748                Ok(RecursiveCanonical(Canonical::FixedSizeList(
749                    FixedSizeListArray::new(
750                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
751                        list_size,
752                        validity.execute(ctx)?,
753                        len,
754                    ),
755                )))
756            }
757            Canonical::Struct(st) => {
758                let len = st.len();
759                let StructArrayParts {
760                    struct_fields,
761                    fields,
762                    validity,
763                } = st.into_parts();
764                let executed_fields = fields
765                    .iter()
766                    .map(|f| Ok(f.clone().execute::<RecursiveCanonical>(ctx)?.0.into_array()))
767                    .collect::<VortexResult<Arc<[_]>>>()?;
768
769                Ok(RecursiveCanonical(Canonical::Struct(unsafe {
770                    StructArray::new_unchecked(
771                        executed_fields,
772                        struct_fields,
773                        len,
774                        validity.execute(ctx)?,
775                    )
776                })))
777            }
778            Canonical::Extension(ext) => Ok(RecursiveCanonical(Canonical::Extension(
779                ExtensionArray::new(
780                    ext.ext_dtype().clone(),
781                    ext.storage_array()
782                        .clone()
783                        .execute::<RecursiveCanonical>(ctx)?
784                        .0
785                        .into_array(),
786                ),
787            ))),
788            Canonical::Variant(variant) => {
789                Ok(RecursiveCanonical(Canonical::Variant(VariantArray::new(
790                    variant
791                        .child()
792                        .clone()
793                        .execute::<RecursiveCanonical>(ctx)?
794                        .0
795                        .into_array(),
796                ))))
797            }
798        }
799    }
800}
801
802/// Execute a primitive typed array into a buffer of native values, assuming all values are valid.
803///
804/// # Errors
805///
806/// Returns a `VortexError` if the array is not all-valid (has any nulls).
807impl<T: NativePType> Executable for Buffer<T> {
808    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
809        let array = PrimitiveArray::execute(array, ctx)?;
810        vortex_ensure!(
811            array.all_valid()?,
812            "Cannot execute to native buffer: array is not all-valid."
813        );
814        Ok(array.into_buffer())
815    }
816}
817
818/// Execute the array to canonical form and unwrap as a [`PrimitiveArray`].
819///
820/// This will panic if the array's dtype is not primitive.
821impl Executable for PrimitiveArray {
822    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
823        match array.try_into::<Primitive>() {
824            Ok(primitive) => Ok(primitive),
825            Err(array) => Ok(Canonical::execute(array, ctx)?.into_primitive()),
826        }
827    }
828}
829
830/// Execute the array to canonical form and unwrap as a [`BoolArray`].
831///
832/// This will panic if the array's dtype is not bool.
833impl Executable for BoolArray {
834    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
835        match array.try_into::<Bool>() {
836            Ok(bool_array) => Ok(bool_array),
837            Err(array) => Ok(Canonical::execute(array, ctx)?.into_bool()),
838        }
839    }
840}
841
842/// Execute the array to a [`BitBuffer`], aka a non-nullable  [`BoolArray`].
843///
844/// This will panic if the array's dtype is not non-nullable bool.
845impl Executable for BitBuffer {
846    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
847        let bool = BoolArray::execute(array, ctx)?;
848        assert!(
849            !bool.dtype().is_nullable(),
850            "bit buffer execute only works with non-nullable bool arrays"
851        );
852        Ok(bool.into_bit_buffer())
853    }
854}
855
856/// Execute the array to canonical form and unwrap as a [`NullArray`].
857///
858/// This will panic if the array's dtype is not null.
859impl Executable for NullArray {
860    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
861        match array.try_into::<Null>() {
862            Ok(null_array) => Ok(null_array),
863            Err(array) => Ok(Canonical::execute(array, ctx)?.into_null()),
864        }
865    }
866}
867
868/// Execute the array to canonical form and unwrap as a [`VarBinViewArray`].
869///
870/// This will panic if the array's dtype is not utf8 or binary.
871impl Executable for VarBinViewArray {
872    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
873        match array.try_into::<VarBinView>() {
874            Ok(varbinview) => Ok(varbinview),
875            Err(array) => Ok(Canonical::execute(array, ctx)?.into_varbinview()),
876        }
877    }
878}
879
880/// Execute the array to canonical form and unwrap as an [`ExtensionArray`].
881///
882/// This will panic if the array's dtype is not an extension type.
883impl Executable for ExtensionArray {
884    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
885        match array.try_into::<Extension>() {
886            Ok(ext_array) => Ok(ext_array),
887            Err(array) => Ok(Canonical::execute(array, ctx)?.into_extension()),
888        }
889    }
890}
891
892/// Execute the array to canonical form and unwrap as a [`DecimalArray`].
893///
894/// This will panic if the array's dtype is not decimal.
895impl Executable for DecimalArray {
896    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
897        match array.try_into::<Decimal>() {
898            Ok(decimal) => Ok(decimal),
899            Err(array) => Ok(Canonical::execute(array, ctx)?.into_decimal()),
900        }
901    }
902}
903
904/// Execute the array to canonical form and unwrap as a [`ListViewArray`].
905///
906/// This will panic if the array's dtype is not list.
907impl Executable for ListViewArray {
908    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
909        match array.try_into::<ListView>() {
910            Ok(list) => Ok(list),
911            Err(array) => Ok(Canonical::execute(array, ctx)?.into_listview()),
912        }
913    }
914}
915
916/// Execute the array to canonical form and unwrap as a [`FixedSizeListArray`].
917///
918/// This will panic if the array's dtype is not fixed size list.
919impl Executable for FixedSizeListArray {
920    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
921        match array.try_into::<FixedSizeList>() {
922            Ok(fsl) => Ok(fsl),
923            Err(array) => Ok(Canonical::execute(array, ctx)?.into_fixed_size_list()),
924        }
925    }
926}
927
928/// Execute the array to canonical form and unwrap as a [`StructArray`].
929///
930/// This will panic if the array's dtype is not struct.
931impl Executable for StructArray {
932    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
933        match array.try_into::<Struct>() {
934            Ok(struct_array) => Ok(struct_array),
935            Err(array) => Ok(Canonical::execute(array, ctx)?.into_struct()),
936        }
937    }
938}
939
/// A view into a canonical array type.
///
/// Each variant holds a shared reference, valid for `'a`, to one of the canonical array
/// types, so building or cloning a view never copies the referenced array.
#[derive(Debug, Clone)]
pub enum CanonicalView<'a> {
    /// Reference to a [`NullArray`].
    Null(&'a NullArray),
    /// Reference to a [`BoolArray`].
    Bool(&'a BoolArray),
    /// Reference to a [`PrimitiveArray`].
    Primitive(&'a PrimitiveArray),
    /// Reference to a [`DecimalArray`].
    Decimal(&'a DecimalArray),
    /// Reference to a [`VarBinViewArray`].
    VarBinView(&'a VarBinViewArray),
    /// Reference to a [`ListViewArray`].
    List(&'a ListViewArray),
    /// Reference to a [`FixedSizeListArray`].
    FixedSizeList(&'a FixedSizeListArray),
    /// Reference to a [`StructArray`].
    Struct(&'a StructArray),
    /// Reference to an [`ExtensionArray`].
    Extension(&'a ExtensionArray),
    /// Reference to a [`VariantArray`].
    Variant(&'a VariantArray),
}
954
955impl From<CanonicalView<'_>> for Canonical {
956    fn from(value: CanonicalView<'_>) -> Self {
957        match value {
958            CanonicalView::Null(a) => Canonical::Null(a.clone()),
959            CanonicalView::Bool(a) => Canonical::Bool(a.clone()),
960            CanonicalView::Primitive(a) => Canonical::Primitive(a.clone()),
961            CanonicalView::Decimal(a) => Canonical::Decimal(a.clone()),
962            CanonicalView::VarBinView(a) => Canonical::VarBinView(a.clone()),
963            CanonicalView::List(a) => Canonical::List(a.clone()),
964            CanonicalView::FixedSizeList(a) => Canonical::FixedSizeList(a.clone()),
965            CanonicalView::Struct(a) => Canonical::Struct(a.clone()),
966            CanonicalView::Extension(a) => Canonical::Extension(a.clone()),
967            CanonicalView::Variant(a) => Canonical::Variant(a.clone()),
968        }
969    }
970}
971
972impl AsRef<dyn DynArray> for CanonicalView<'_> {
973    fn as_ref(&self) -> &dyn DynArray {
974        match self {
975            CanonicalView::Null(a) => a.as_ref(),
976            CanonicalView::Bool(a) => a.as_ref(),
977            CanonicalView::Primitive(a) => a.as_ref(),
978            CanonicalView::Decimal(a) => a.as_ref(),
979            CanonicalView::VarBinView(a) => a.as_ref(),
980            CanonicalView::List(a) => a.as_ref(),
981            CanonicalView::FixedSizeList(a) => a.as_ref(),
982            CanonicalView::Struct(a) => a.as_ref(),
983            CanonicalView::Extension(a) => a.as_ref(),
984            CanonicalView::Variant(a) => a.as_ref(),
985        }
986    }
987}
988
989/// A matcher for any canonical array type.
990pub struct AnyCanonical;
991impl Matcher for AnyCanonical {
992    type Match<'a> = CanonicalView<'a>;
993
994    fn matches(array: &dyn DynArray) -> bool {
995        array.is::<Null>()
996            || array.is::<Bool>()
997            || array.is::<Primitive>()
998            || array.is::<Decimal>()
999            || array.is::<Struct>()
1000            || array.is::<ListView>()
1001            || array.is::<FixedSizeList>()
1002            || array.is::<VarBinView>()
1003            || array.is::<Variant>()
1004            || array.is::<Extension>()
1005    }
1006
1007    fn try_match<'a>(array: &'a dyn DynArray) -> Option<Self::Match<'a>> {
1008        if let Some(a) = array.as_opt::<Null>() {
1009            Some(CanonicalView::Null(a))
1010        } else if let Some(a) = array.as_opt::<Bool>() {
1011            Some(CanonicalView::Bool(a))
1012        } else if let Some(a) = array.as_opt::<Primitive>() {
1013            Some(CanonicalView::Primitive(a))
1014        } else if let Some(a) = array.as_opt::<Decimal>() {
1015            Some(CanonicalView::Decimal(a))
1016        } else if let Some(a) = array.as_opt::<Struct>() {
1017            Some(CanonicalView::Struct(a))
1018        } else if let Some(a) = array.as_opt::<ListView>() {
1019            Some(CanonicalView::List(a))
1020        } else if let Some(a) = array.as_opt::<FixedSizeList>() {
1021            Some(CanonicalView::FixedSizeList(a))
1022        } else if let Some(a) = array.as_opt::<VarBinView>() {
1023            Some(CanonicalView::VarBinView(a))
1024        } else if let Some(a) = array.as_opt::<Variant>() {
1025            Some(CanonicalView::Variant(a))
1026        } else {
1027            array.as_opt::<Extension>().map(CanonicalView::Extension)
1028        }
1029    }
1030}
1031
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use arrow_array::Array as ArrowArray;
    use arrow_array::ArrayRef as ArrowArrayRef;
    use arrow_array::ListArray as ArrowListArray;
    use arrow_array::PrimitiveArray as ArrowPrimitiveArray;
    use arrow_array::StringArray;
    use arrow_array::StringViewArray;
    use arrow_array::StructArray as ArrowStructArray;
    use arrow_array::cast::AsArray;
    use arrow_array::types::Int32Type;
    use arrow_array::types::Int64Type;
    use arrow_array::types::UInt64Type;
    use arrow_buffer::NullBufferBuilder;
    use arrow_buffer::OffsetBuffer;
    use arrow_schema::DataType;
    use arrow_schema::Field;
    use vortex_buffer::buffer;

    use crate::ArrayRef;
    use crate::IntoArray;
    use crate::arrays::ConstantArray;
    use crate::arrow::FromArrowArray;
    use crate::arrow::IntoArrowArray;
    use crate::canonical::StructArray;

    /// Converting a nested struct to Arrow must canonicalize non-canonical children too.
    #[test]
    fn test_canonicalize_nested_struct() {
        // The nested struct contains a ConstantArray representing the primitive array
        //   [100i64]
        // ConstantArray is not a canonical type, so converting `into_arrow()` should
        // map this to the nearest canonical type (PrimitiveArray).
        let inner_vortex = StructArray::from_fields(&[(
            "inner_a",
            ConstantArray::new(100i64, 1).into_array(),
        )])
        .unwrap();

        // Outer struct with multiple internal components: a plain buffer and the nested struct.
        let outer_vortex = StructArray::from_fields(&[
            ("a", buffer![1u64].into_array()),
            ("b", inner_vortex.into_array()),
        ])
        .unwrap();

        let outer_arrow_ref = outer_vortex.into_array().into_arrow_preferred().unwrap();
        let outer_arrow = outer_arrow_ref
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .cloned()
            .unwrap();

        // Field "a" stays a u64 primitive column.
        let column_a = outer_arrow.column(0);
        assert!(
            column_a
                .as_any()
                .downcast_ref::<ArrowPrimitiveArray<UInt64Type>>()
                .is_some()
        );

        // Field "b" is itself a struct column.
        let inner_arrow = outer_arrow
            .column(1)
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .cloned()
            .unwrap();

        // The constant child must surface as an i64 primitive column holding [100].
        let inner_a = inner_arrow
            .column(0)
            .as_any()
            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
        assert!(inner_a.is_some());

        assert_eq!(
            inner_a.cloned().unwrap(),
            ArrowPrimitiveArray::from_iter([100i64])
        );
    }

    /// A nullable Arrow struct survives an Arrow -> Vortex -> Arrow round trip unchanged.
    #[test]
    fn roundtrip_struct() {
        // Struct-level validity: four valid rows, one null row, one valid row.
        let mut struct_validity = NullBufferBuilder::new(6);
        struct_validity.append_n_non_nulls(4);
        struct_validity.append_null();
        struct_validity.append_non_null();

        let name_column = Arc::new(StringViewArray::from_iter(vec![
            Some("Joseph"),
            None,
            Some("Angela"),
            Some("Mikhail"),
            None,
            None,
        ]));
        let age_column = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
            Some(25),
            Some(31),
            None,
            Some(57),
            None,
            None,
        ]));

        let schema_fields = vec![
            Arc::new(Field::new("name", DataType::Utf8View, true)),
            Arc::new(Field::new("age", DataType::Int32, true)),
        ];
        let arrow_struct = ArrowStructArray::new(
            schema_fields.into(),
            vec![name_column, age_column],
            struct_validity.finish(),
        );

        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true).unwrap();
        let round_tripped = vortex_struct.into_arrow_preferred().unwrap();

        assert_eq!(&arrow_struct, round_tripped.as_struct());
    }

    /// An Arrow list of strings survives an Arrow -> Vortex -> Arrow round trip unchanged.
    #[test]
    fn roundtrip_list() {
        let elements = Arc::new(StringArray::from_iter(vec![
            Some("Joseph"),
            Some("Angela"),
            Some("Mikhail"),
        ]));

        // Three lists with lengths 0, 2 and 1 over the three elements above.
        let arrow_list = ArrowListArray::new(
            Arc::new(Field::new_list_field(DataType::Utf8, true)),
            OffsetBuffer::from_lengths(vec![0, 2, 1]),
            elements,
            None,
        );
        let target_type = arrow_list.data_type();

        let vortex_list = ArrayRef::from_arrow(&arrow_list, true).unwrap();

        // Request the original Arrow list type explicitly on the way back.
        let round_tripped = vortex_list.into_arrow(target_type).unwrap();

        let expected: ArrowArrayRef = Arc::new(arrow_list.clone());
        assert_eq!(expected.as_ref(), round_tripped.as_ref());
    }
}