Skip to main content

vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use std::sync::Arc;
7
8use vortex_buffer::BitBuffer;
9use vortex_buffer::Buffer;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_ensure;
13use vortex_error::vortex_panic;
14
15use crate::ArrayRef;
16use crate::Executable;
17use crate::ExecutionCtx;
18use crate::IntoArray;
19use crate::array::ArrayView;
20use crate::array::child_to_validity;
21use crate::arrays::Bool;
22use crate::arrays::BoolArray;
23use crate::arrays::Decimal;
24use crate::arrays::DecimalArray;
25use crate::arrays::Extension;
26use crate::arrays::ExtensionArray;
27use crate::arrays::FixedSizeList;
28use crate::arrays::FixedSizeListArray;
29use crate::arrays::ListView;
30use crate::arrays::ListViewArray;
31use crate::arrays::Null;
32use crate::arrays::NullArray;
33use crate::arrays::Primitive;
34use crate::arrays::PrimitiveArray;
35use crate::arrays::Struct;
36use crate::arrays::StructArray;
37use crate::arrays::VarBinView;
38use crate::arrays::VarBinViewArray;
39use crate::arrays::Variant;
40use crate::arrays::VariantArray;
41use crate::arrays::bool::BoolDataParts;
42use crate::arrays::decimal::DecimalDataParts;
43use crate::arrays::extension::ExtensionArrayExt;
44use crate::arrays::fixed_size_list::FixedSizeListArrayExt;
45use crate::arrays::listview::ListViewDataParts;
46use crate::arrays::listview::ListViewRebuildMode;
47use crate::arrays::primitive::PrimitiveDataParts;
48use crate::arrays::struct_::StructDataParts;
49use crate::arrays::varbinview::VarBinViewDataParts;
50use crate::arrays::variant::VariantArrayExt;
51use crate::dtype::DType;
52use crate::dtype::NativePType;
53use crate::dtype::Nullability;
54use crate::dtype::PType;
55use crate::match_each_decimal_value_type;
56use crate::match_each_native_ptype;
57use crate::matcher::Matcher;
58use crate::validity::Validity;
59
60/// An enum capturing the default uncompressed encodings for each [Vortex type](DType).
61///
62/// Any array can be decoded into canonical form via the `to_canonical`
63/// trait method. This is the simplest encoding for a type, and will not be compressed but may
64/// contain compressed child arrays.
65///
66/// Canonical form is useful for doing type-specific compute where you need to know that all
67/// elements are laid out decompressed and contiguous in memory.
68///
69/// Each `Canonical` variant has a corresponding [`DType`] variant, with the notable exception of
70/// [`Canonical::VarBinView`], which is the canonical encoding for both [`DType::Utf8`] and
71/// [`DType::Binary`].
72///
73/// # Laziness
74///
75/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
76/// `Struct` type, individual column child arrays may still be compressed. This allows
77/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
78/// never need to be decoded into canonical form at all depending on the compute.
79///
80/// # Arrow interoperability
81///
82/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
83/// zero-copy, and the corresponding Arrow array types can also be built directly.
84///
85/// The full list of canonical types and their equivalent Arrow array types are:
86///
87/// * `NullArray`: [`arrow_array::NullArray`]
88/// * `BoolArray`: [`arrow_array::BooleanArray`]
89/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
90/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
91/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
92/// * `ListViewArray`: [`arrow_array::ListViewArray`]
93/// * `FixedSizeListArray`: [`arrow_array::FixedSizeListArray`]
94/// * `StructArray`: [`arrow_array::StructArray`]
95///
96/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
97/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
98/// create ambiguity.
99/// Thus, if you receive an Arrow array, compress it using Vortex, and then
100/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
101/// variants to hold the data.
102///
103/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
104/// will correspond to an arrow-rs [`arrow_schema::DataType`].
105///
106/// # Views support
107///
/// Binary and String views, also known as "German strings", are a better encoding format for
/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec and are
/// fully supported by the DataFusion query engine. We use them as our canonical string encoding
111/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
112/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
113/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
114/// from memory.
115///
116/// # For Developers
117///
118/// If you add another variant to this enum, make sure to update `dyn Array::is_canonical`,
119/// and the fuzzer in `fuzz/fuzz_targets/array_ops.rs`.
#[derive(Debug, Clone)]
pub enum Canonical {
    /// Canonical array for [`DType::Null`].
    Null(NullArray),
    /// Canonical array for [`DType::Bool`].
    Bool(BoolArray),
    /// Canonical array for [`DType::Primitive`].
    Primitive(PrimitiveArray),
    /// Canonical array for [`DType::Decimal`].
    Decimal(DecimalArray),
    /// Canonical array shared by [`DType::Utf8`] and [`DType::Binary`].
    VarBinView(VarBinViewArray),
    /// Canonical array for [`DType::List`]; note that the payload is a [`ListViewArray`].
    List(ListViewArray),
    /// Canonical array for [`DType::FixedSizeList`].
    FixedSizeList(FixedSizeListArray),
    /// Canonical array for [`DType::Struct`].
    Struct(StructArray),
    /// Canonical array for [`DType::Extension`], wrapping a storage array.
    Extension(ExtensionArray),
    /// Canonical array for [`DType::Variant`].
    Variant(VariantArray),
}
133
/// Dispatch over every [`Canonical`] variant, binding the wrapped array to the given identifier
/// and evaluating the same expression in each arm.
///
/// Arms are listed in the declaration order of the enum.
macro_rules! match_each_canonical {
    ($canonical:expr, | $binding:ident | $body:expr) => {{
        match $canonical {
            Canonical::Null($binding) => $body,
            Canonical::Bool($binding) => $body,
            Canonical::Primitive($binding) => $body,
            Canonical::Decimal($binding) => $body,
            Canonical::VarBinView($binding) => $body,
            Canonical::List($binding) => $body,
            Canonical::FixedSizeList($binding) => $body,
            Canonical::Struct($binding) => $body,
            Canonical::Extension($binding) => $body,
            Canonical::Variant($binding) => $body,
        }
    }};
}
151
152impl Canonical {
153    /// Create an empty canonical array of the given dtype.
154    pub fn empty(dtype: &DType) -> Canonical {
155        match dtype {
156            DType::Null => Canonical::Null(NullArray::new(0)),
157            DType::Bool(n) => Canonical::Bool(unsafe {
158                BoolArray::new_unchecked(BitBuffer::empty(), Validity::from(n))
159            }),
160            DType::Primitive(ptype, n) => {
161                match_each_native_ptype!(ptype, |P| {
162                    Canonical::Primitive(unsafe {
163                        PrimitiveArray::new_unchecked(Buffer::<P>::empty(), Validity::from(n))
164                    })
165                })
166            }
167            DType::Decimal(decimal_type, n) => {
168                match_each_decimal_value_type!(
169                    DecimalType::smallest_decimal_value_type(decimal_type),
170                    |D| {
171                        Canonical::Decimal(unsafe {
172                            DecimalArray::new_unchecked::<D>(
173                                Buffer::empty(),
174                                *decimal_type,
175                                Validity::from(n),
176                            )
177                        })
178                    }
179                )
180            }
181            DType::Utf8(n) => Canonical::VarBinView(unsafe {
182                VarBinViewArray::new_unchecked(
183                    Buffer::empty(),
184                    Arc::new([]),
185                    dtype.clone(),
186                    Validity::from(n),
187                )
188            }),
189            DType::Binary(n) => Canonical::VarBinView(unsafe {
190                VarBinViewArray::new_unchecked(
191                    Buffer::empty(),
192                    Arc::new([]),
193                    dtype.clone(),
194                    Validity::from(n),
195                )
196            }),
197            DType::Struct(struct_dtype, n) => Canonical::Struct(unsafe {
198                StructArray::new_unchecked(
199                    struct_dtype
200                        .fields()
201                        .map(|f| Canonical::empty(&f).into_array())
202                        .collect::<Arc<[_]>>(),
203                    struct_dtype.clone(),
204                    0,
205                    Validity::from(n),
206                )
207            }),
208            DType::List(dtype, n) => Canonical::List(unsafe {
209                ListViewArray::new_unchecked(
210                    Canonical::empty(dtype).into_array(),
211                    Canonical::empty(&DType::Primitive(PType::U8, Nullability::NonNullable))
212                        .into_array(),
213                    Canonical::empty(&DType::Primitive(PType::U8, Nullability::NonNullable))
214                        .into_array(),
215                    Validity::from(n),
216                )
217                // An empty list view is trivially copyable to a list.
218                .with_zero_copy_to_list(true)
219            }),
220            DType::FixedSizeList(elem_dtype, list_size, null) => Canonical::FixedSizeList(unsafe {
221                FixedSizeListArray::new_unchecked(
222                    Canonical::empty(elem_dtype).into_array(),
223                    *list_size,
224                    Validity::from(null),
225                    0,
226                )
227            }),
228            DType::Extension(ext_dtype) => Canonical::Extension(ExtensionArray::new(
229                ext_dtype.clone(),
230                Canonical::empty(ext_dtype.storage_dtype()).into_array(),
231            )),
232            DType::Variant(_) => {
233                vortex_panic!(InvalidArgument: "Canonical empty is not supported for Variant")
234            }
235        }
236    }
237
238    pub fn len(&self) -> usize {
239        match_each_canonical!(self, |arr| arr.len())
240    }
241
242    pub fn dtype(&self) -> &DType {
243        match_each_canonical!(self, |arr| arr.dtype())
244    }
245
246    pub fn is_empty(&self) -> bool {
247        match_each_canonical!(self, |arr| arr.is_empty())
248    }
249}
250
251impl Canonical {
252    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
253    ///
254    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
255    /// they can accumulate wasted space after slicing and taking operations.
256    ///
257    /// This operation is very expensive and can result in things like allocations, full-scans
258    /// and copy operations.
259    pub fn compact(&self) -> VortexResult<Canonical> {
260        match self {
261            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
262            Canonical::List(array) => Ok(Canonical::List(
263                array.rebuild(ListViewRebuildMode::TrimElements)?,
264            )),
265            _ => Ok(self.clone()),
266        }
267    }
268}
269
270// Unwrap canonical type back down to specialized type.
271impl Canonical {
272    pub fn as_null(&self) -> &NullArray {
273        if let Canonical::Null(a) = self {
274            a
275        } else {
276            vortex_panic!("Cannot get NullArray from {:?}", &self)
277        }
278    }
279
280    pub fn into_null(self) -> NullArray {
281        if let Canonical::Null(a) = self {
282            a
283        } else {
284            vortex_panic!("Cannot unwrap NullArray from {:?}", &self)
285        }
286    }
287
288    pub fn as_bool(&self) -> &BoolArray {
289        if let Canonical::Bool(a) = self {
290            a
291        } else {
292            vortex_panic!("Cannot get BoolArray from {:?}", &self)
293        }
294    }
295
296    pub fn into_bool(self) -> BoolArray {
297        if let Canonical::Bool(a) = self {
298            a
299        } else {
300            vortex_panic!("Cannot unwrap BoolArray from {:?}", &self)
301        }
302    }
303
304    pub fn as_primitive(&self) -> &PrimitiveArray {
305        if let Canonical::Primitive(a) = self {
306            a
307        } else {
308            vortex_panic!("Cannot get PrimitiveArray from {:?}", &self)
309        }
310    }
311
312    pub fn into_primitive(self) -> PrimitiveArray {
313        if let Canonical::Primitive(a) = self {
314            a
315        } else {
316            vortex_panic!("Cannot unwrap PrimitiveArray from {:?}", &self)
317        }
318    }
319
320    pub fn as_decimal(&self) -> &DecimalArray {
321        if let Canonical::Decimal(a) = self {
322            a
323        } else {
324            vortex_panic!("Cannot get DecimalArray from {:?}", &self)
325        }
326    }
327
328    pub fn into_decimal(self) -> DecimalArray {
329        if let Canonical::Decimal(a) = self {
330            a
331        } else {
332            vortex_panic!("Cannot unwrap DecimalArray from {:?}", &self)
333        }
334    }
335
336    pub fn as_varbinview(&self) -> &VarBinViewArray {
337        if let Canonical::VarBinView(a) = self {
338            a
339        } else {
340            vortex_panic!("Cannot get VarBinViewArray from {:?}", &self)
341        }
342    }
343
344    pub fn into_varbinview(self) -> VarBinViewArray {
345        if let Canonical::VarBinView(a) = self {
346            a
347        } else {
348            vortex_panic!("Cannot unwrap VarBinViewArray from {:?}", &self)
349        }
350    }
351
352    pub fn as_listview(&self) -> &ListViewArray {
353        if let Canonical::List(a) = self {
354            a
355        } else {
356            vortex_panic!("Cannot get ListArray from {:?}", &self)
357        }
358    }
359
360    pub fn into_listview(self) -> ListViewArray {
361        if let Canonical::List(a) = self {
362            a
363        } else {
364            vortex_panic!("Cannot unwrap ListArray from {:?}", &self)
365        }
366    }
367
368    pub fn as_fixed_size_list(&self) -> &FixedSizeListArray {
369        if let Canonical::FixedSizeList(a) = self {
370            a
371        } else {
372            vortex_panic!("Cannot get FixedSizeListArray from {:?}", &self)
373        }
374    }
375
376    pub fn into_fixed_size_list(self) -> FixedSizeListArray {
377        if let Canonical::FixedSizeList(a) = self {
378            a
379        } else {
380            vortex_panic!("Cannot unwrap FixedSizeListArray from {:?}", &self)
381        }
382    }
383
384    pub fn as_struct(&self) -> &StructArray {
385        if let Canonical::Struct(a) = self {
386            a
387        } else {
388            vortex_panic!("Cannot get StructArray from {:?}", &self)
389        }
390    }
391
392    pub fn into_struct(self) -> StructArray {
393        if let Canonical::Struct(a) = self {
394            a
395        } else {
396            vortex_panic!("Cannot unwrap StructArray from {:?}", &self)
397        }
398    }
399
400    pub fn as_extension(&self) -> &ExtensionArray {
401        if let Canonical::Extension(a) = self {
402            a
403        } else {
404            vortex_panic!("Cannot get ExtensionArray from {:?}", &self)
405        }
406    }
407
408    pub fn into_extension(self) -> ExtensionArray {
409        if let Canonical::Extension(a) = self {
410            a
411        } else {
412            vortex_panic!("Cannot unwrap ExtensionArray from {:?}", &self)
413        }
414    }
415}
416
417impl IntoArray for Canonical {
418    fn into_array(self) -> ArrayRef {
419        match_each_canonical!(self, |arr| arr.into_array())
420    }
421}
422
/// Convenience trait for canonicalizing an array and unwrapping the result as one specific
/// canonical array type.
///
/// # Canonicalization
///
/// Each method is meant for an array whose [`DType`] matches the requested canonical type.
///
/// # Panics
///
/// The implementation for [`ArrayRef`] in this file panics if canonicalization fails or if the
/// canonical form is a different variant than the one requested.
pub trait ToCanonical {
    /// Canonicalize into a [`NullArray`] if the target is [`Null`](DType::Null) typed.
    fn to_null(&self) -> NullArray;

    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`](DType::Bool) typed.
    fn to_bool(&self) -> BoolArray;

    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`](DType::Primitive)
    /// typed.
    fn to_primitive(&self) -> PrimitiveArray;

    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`](DType::Decimal)
    /// typed.
    fn to_decimal(&self) -> DecimalArray;

    /// Canonicalize into a [`StructArray`] if the target is [`Struct`](DType::Struct) typed.
    fn to_struct(&self) -> StructArray;

    /// Canonicalize into a [`ListViewArray`] if the target is [`List`](DType::List) typed.
    fn to_listview(&self) -> ListViewArray;

    /// Canonicalize into a [`FixedSizeListArray`] if the target is
    /// [`FixedSizeList`](DType::FixedSizeList) typed.
    fn to_fixed_size_list(&self) -> FixedSizeListArray;

    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`](DType::Utf8)
    /// or [`Binary`](DType::Binary) typed.
    fn to_varbinview(&self) -> VarBinViewArray;

    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`](DType::Extension)
    /// typed.
    fn to_extension(&self) -> ExtensionArray;
}
461
462// Blanket impl for all Array encodings.
463impl ToCanonical for ArrayRef {
464    fn to_null(&self) -> NullArray {
465        self.to_canonical()
466            .vortex_expect("to_canonical failed")
467            .into_null()
468    }
469
470    fn to_bool(&self) -> BoolArray {
471        self.to_canonical()
472            .vortex_expect("to_canonical failed")
473            .into_bool()
474    }
475
476    fn to_primitive(&self) -> PrimitiveArray {
477        self.to_canonical()
478            .vortex_expect("to_canonical failed")
479            .into_primitive()
480    }
481
482    fn to_decimal(&self) -> DecimalArray {
483        self.to_canonical()
484            .vortex_expect("to_canonical failed")
485            .into_decimal()
486    }
487
488    fn to_struct(&self) -> StructArray {
489        self.to_canonical()
490            .vortex_expect("to_canonical failed")
491            .into_struct()
492    }
493
494    fn to_listview(&self) -> ListViewArray {
495        self.to_canonical()
496            .vortex_expect("to_canonical failed")
497            .into_listview()
498    }
499
500    fn to_fixed_size_list(&self) -> FixedSizeListArray {
501        self.to_canonical()
502            .vortex_expect("to_canonical failed")
503            .into_fixed_size_list()
504    }
505
506    fn to_varbinview(&self) -> VarBinViewArray {
507        self.to_canonical()
508            .vortex_expect("to_canonical failed")
509            .into_varbinview()
510    }
511
512    fn to_extension(&self) -> ExtensionArray {
513        self.to_canonical()
514            .vortex_expect("to_canonical failed")
515            .into_extension()
516    }
517}
518
519impl From<Canonical> for ArrayRef {
520    fn from(value: Canonical) -> Self {
521        match_each_canonical!(value, |arr| arr.into_array())
522    }
523}
524
525/// Execute into [`Canonical`] by running `execute_until` with the [`AnyCanonical`] matcher.
526///
527/// Unlike executing into [`crate::Columnar`], this will fully expand constant arrays into their
528/// canonical form. Callers should prefer to execute into `Columnar` if they are able to optimize
529/// their use for constant arrays.
530impl Executable for Canonical {
531    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
532        let result = array.execute_until::<AnyCanonical>(ctx)?;
533        Ok(result
534            .as_opt::<AnyCanonical>()
535            .map(Canonical::from)
536            .vortex_expect("execute_until::<AnyCanonical> must return a canonical array"))
537    }
538}
539
/// Recursively execute the array until it reaches canonical form along with its validity.
///
/// The `Executable` implementation in this file first executes to [`Canonical`], then executes
/// the validity of the resulting array and rebuilds it. For `Extension` and `Variant` arrays it
/// recurses into the storage / child array instead.
///
/// Callers should prefer to execute into `Columnar` instead of this specific target.
/// This target is useful when preparing arrays for writing.
pub struct CanonicalValidity(pub Canonical);
545
546impl Executable for CanonicalValidity {
547    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
548        match array.execute::<Canonical>(ctx)? {
549            n @ Canonical::Null(_) => Ok(CanonicalValidity(n)),
550            Canonical::Bool(b) => {
551                let validity = child_to_validity(&b.slots()[0], b.dtype().nullability());
552                let len = b.len();
553                let BoolDataParts { bits, offset, len } = b.into_data().into_parts(len);
554                Ok(CanonicalValidity(Canonical::Bool(
555                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
556                )))
557            }
558            Canonical::Primitive(p) => {
559                let PrimitiveDataParts {
560                    ptype,
561                    buffer,
562                    validity,
563                } = p.into_data_parts();
564                Ok(CanonicalValidity(Canonical::Primitive(unsafe {
565                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
566                })))
567            }
568            Canonical::Decimal(d) => {
569                let DecimalDataParts {
570                    decimal_dtype,
571                    values,
572                    values_type,
573                    validity,
574                } = d.into_data_parts();
575                Ok(CanonicalValidity(Canonical::Decimal(unsafe {
576                    DecimalArray::new_unchecked_handle(
577                        values,
578                        values_type,
579                        decimal_dtype,
580                        validity.execute(ctx)?,
581                    )
582                })))
583            }
584            Canonical::VarBinView(vbv) => {
585                let VarBinViewDataParts {
586                    dtype,
587                    buffers,
588                    views,
589                    validity,
590                } = vbv.into_data_parts();
591                Ok(CanonicalValidity(Canonical::VarBinView(unsafe {
592                    VarBinViewArray::new_handle_unchecked(
593                        views,
594                        buffers,
595                        dtype,
596                        validity.execute(ctx)?,
597                    )
598                })))
599            }
600            Canonical::List(l) => {
601                let zctl = l.is_zero_copy_to_list();
602                let ListViewDataParts {
603                    elements,
604                    offsets,
605                    sizes,
606                    validity,
607                    ..
608                } = l.into_data_parts();
609                Ok(CanonicalValidity(Canonical::List(unsafe {
610                    ListViewArray::new_unchecked(elements, offsets, sizes, validity.execute(ctx)?)
611                        .with_zero_copy_to_list(zctl)
612                })))
613            }
614            Canonical::FixedSizeList(fsl) => {
615                let list_size = fsl.list_size();
616                let len = fsl.len();
617                let parts = fsl.into_data_parts();
618                let elements = parts.elements;
619                let validity = parts.validity;
620                Ok(CanonicalValidity(Canonical::FixedSizeList(
621                    FixedSizeListArray::new(elements, list_size, validity.execute(ctx)?, len),
622                )))
623            }
624            Canonical::Struct(st) => {
625                let len = st.len();
626                let StructDataParts {
627                    struct_fields,
628                    fields,
629                    validity,
630                } = st.into_data_parts();
631                Ok(CanonicalValidity(Canonical::Struct(unsafe {
632                    StructArray::new_unchecked(fields, struct_fields, len, validity.execute(ctx)?)
633                })))
634            }
635            Canonical::Extension(ext) => Ok(CanonicalValidity(Canonical::Extension(
636                ExtensionArray::new(
637                    ext.ext_dtype().clone(),
638                    ext.storage_array()
639                        .clone()
640                        .execute::<CanonicalValidity>(ctx)?
641                        .0
642                        .into_array(),
643                ),
644            ))),
645            Canonical::Variant(variant) => {
646                Ok(CanonicalValidity(Canonical::Variant(VariantArray::new(
647                    variant
648                        .child()
649                        .clone()
650                        .execute::<CanonicalValidity>(ctx)?
651                        .0
652                        .into_array(),
653                ))))
654            }
655        }
656    }
657}
658
/// Recursively execute the array until all of its children are canonical.
///
/// The `Executable` implementation in this file executes to [`Canonical`] and then executes list
/// elements/offsets/sizes, fixed-size-list elements, struct fields, extension storage and
/// variant children to `RecursiveCanonical` as well, along with the array's validity.
///
/// This method is useful to guarantee that all operators are fully executed,
/// callers should prefer an execution target that's suitable for their use case instead of this one.
pub struct RecursiveCanonical(pub Canonical);
664
665impl Executable for RecursiveCanonical {
666    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
667        match array.execute::<Canonical>(ctx)? {
668            n @ Canonical::Null(_) => Ok(RecursiveCanonical(n)),
669            Canonical::Bool(b) => {
670                let validity = child_to_validity(&b.slots()[0], b.dtype().nullability());
671                let len = b.len();
672                let BoolDataParts { bits, offset, len } = b.into_data().into_parts(len);
673                Ok(RecursiveCanonical(Canonical::Bool(
674                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
675                )))
676            }
677            Canonical::Primitive(p) => {
678                let PrimitiveDataParts {
679                    ptype,
680                    buffer,
681                    validity,
682                } = p.into_data_parts();
683                Ok(RecursiveCanonical(Canonical::Primitive(unsafe {
684                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
685                })))
686            }
687            Canonical::Decimal(d) => {
688                let DecimalDataParts {
689                    decimal_dtype,
690                    values,
691                    values_type,
692                    validity,
693                } = d.into_data_parts();
694                Ok(RecursiveCanonical(Canonical::Decimal(unsafe {
695                    DecimalArray::new_unchecked_handle(
696                        values,
697                        values_type,
698                        decimal_dtype,
699                        validity.execute(ctx)?,
700                    )
701                })))
702            }
703            Canonical::VarBinView(vbv) => {
704                let VarBinViewDataParts {
705                    dtype,
706                    buffers,
707                    views,
708                    validity,
709                } = vbv.into_data_parts();
710                Ok(RecursiveCanonical(Canonical::VarBinView(unsafe {
711                    VarBinViewArray::new_handle_unchecked(
712                        views,
713                        buffers,
714                        dtype,
715                        validity.execute(ctx)?,
716                    )
717                })))
718            }
719            Canonical::List(l) => {
720                let zctl = l.is_zero_copy_to_list();
721                let ListViewDataParts {
722                    elements,
723                    offsets,
724                    sizes,
725                    validity,
726                    ..
727                } = l.into_data_parts();
728                Ok(RecursiveCanonical(Canonical::List(unsafe {
729                    ListViewArray::new_unchecked(
730                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
731                        offsets.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
732                        sizes.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
733                        validity.execute(ctx)?,
734                    )
735                    .with_zero_copy_to_list(zctl)
736                })))
737            }
738            Canonical::FixedSizeList(fsl) => {
739                let list_size = fsl.list_size();
740                let len = fsl.len();
741                let parts = fsl.into_data_parts();
742                let elements = parts.elements;
743                let validity = parts.validity;
744                Ok(RecursiveCanonical(Canonical::FixedSizeList(
745                    FixedSizeListArray::new(
746                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
747                        list_size,
748                        validity.execute(ctx)?,
749                        len,
750                    ),
751                )))
752            }
753            Canonical::Struct(st) => {
754                let len = st.len();
755                let StructDataParts {
756                    struct_fields,
757                    fields,
758                    validity,
759                } = st.into_data_parts();
760                let executed_fields = fields
761                    .iter()
762                    .map(|f| Ok(f.clone().execute::<RecursiveCanonical>(ctx)?.0.into_array()))
763                    .collect::<VortexResult<Arc<[_]>>>()?;
764
765                Ok(RecursiveCanonical(Canonical::Struct(unsafe {
766                    StructArray::new_unchecked(
767                        executed_fields,
768                        struct_fields,
769                        len,
770                        validity.execute(ctx)?,
771                    )
772                })))
773            }
774            Canonical::Extension(ext) => Ok(RecursiveCanonical(Canonical::Extension(
775                ExtensionArray::new(
776                    ext.ext_dtype().clone(),
777                    ext.storage_array()
778                        .clone()
779                        .execute::<RecursiveCanonical>(ctx)?
780                        .0
781                        .into_array(),
782                ),
783            ))),
784            Canonical::Variant(variant) => {
785                Ok(RecursiveCanonical(Canonical::Variant(VariantArray::new(
786                    variant
787                        .child()
788                        .clone()
789                        .execute::<RecursiveCanonical>(ctx)?
790                        .0
791                        .into_array(),
792                ))))
793            }
794        }
795    }
796}
797
798/// Execute a primitive typed array into a buffer of native values, assuming all values are valid.
799///
800/// # Errors
801///
802/// Returns a `VortexError` if the array is not all-valid (has any nulls).
803impl<T: NativePType> Executable for Buffer<T> {
804    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
805        let array = PrimitiveArray::execute(array, ctx)?;
806        vortex_ensure!(
807            matches!(
808                array.validity()?,
809                Validity::NonNullable | Validity::AllValid
810            ),
811            "Cannot execute to native buffer: array is not all-valid."
812        );
813        Ok(array.into_buffer())
814    }
815}
816
817/// Execute the array to canonical form and unwrap as a [`PrimitiveArray`].
818///
819/// This will panic if the array's dtype is not primitive.
820impl Executable for PrimitiveArray {
821    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
822        match array.try_downcast::<Primitive>() {
823            Ok(primitive) => Ok(primitive),
824            Err(array) => Ok(Canonical::execute(array, ctx)?.into_primitive()),
825        }
826    }
827}
828
829/// Execute the array to canonical form and unwrap as a [`BoolArray`].
830///
831/// This will panic if the array's dtype is not bool.
832impl Executable for BoolArray {
833    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
834        match array.try_downcast::<Bool>() {
835            Ok(bool_array) => Ok(bool_array),
836            Err(array) => Ok(Canonical::execute(array, ctx)?.into_bool()),
837        }
838    }
839}
840
841/// Execute the array to a [`BitBuffer`], aka a non-nullable  [`BoolArray`].
842///
843/// This will panic if the array's dtype is not non-nullable bool.
844impl Executable for BitBuffer {
845    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
846        let bool = BoolArray::execute(array, ctx)?;
847        assert!(
848            !bool.dtype().is_nullable(),
849            "bit buffer execute only works with non-nullable bool arrays"
850        );
851        Ok(bool.into_bit_buffer())
852    }
853}
854
855/// Execute the array to canonical form and unwrap as a [`NullArray`].
856///
857/// This will panic if the array's dtype is not null.
858impl Executable for NullArray {
859    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
860        match array.try_downcast::<Null>() {
861            Ok(null_array) => Ok(null_array),
862            Err(array) => Ok(Canonical::execute(array, ctx)?.into_null()),
863        }
864    }
865}
866
867/// Execute the array to canonical form and unwrap as a [`VarBinViewArray`].
868///
869/// This will panic if the array's dtype is not utf8 or binary.
870impl Executable for VarBinViewArray {
871    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
872        match array.try_downcast::<VarBinView>() {
873            Ok(varbinview) => Ok(varbinview),
874            Err(array) => Ok(Canonical::execute(array, ctx)?.into_varbinview()),
875        }
876    }
877}
878
879/// Execute the array to canonical form and unwrap as an [`ExtensionArray`].
880///
881/// This will panic if the array's dtype is not an extension type.
882impl Executable for ExtensionArray {
883    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
884        match array.try_downcast::<Extension>() {
885            Ok(ext_array) => Ok(ext_array),
886            Err(array) => Ok(Canonical::execute(array, ctx)?.into_extension()),
887        }
888    }
889}
890
891/// Execute the array to canonical form and unwrap as a [`DecimalArray`].
892///
893/// This will panic if the array's dtype is not decimal.
894impl Executable for DecimalArray {
895    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
896        match array.try_downcast::<Decimal>() {
897            Ok(decimal) => Ok(decimal),
898            Err(array) => Ok(Canonical::execute(array, ctx)?.into_decimal()),
899        }
900    }
901}
902
903/// Execute the array to canonical form and unwrap as a [`ListViewArray`].
904///
905/// This will panic if the array's dtype is not list.
906impl Executable for ListViewArray {
907    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
908        match array.try_downcast::<ListView>() {
909            Ok(list) => Ok(list),
910            Err(array) => Ok(Canonical::execute(array, ctx)?.into_listview()),
911        }
912    }
913}
914
915/// Execute the array to canonical form and unwrap as a [`FixedSizeListArray`].
916///
917/// This will panic if the array's dtype is not fixed size list.
918impl Executable for FixedSizeListArray {
919    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
920        match array.try_downcast::<FixedSizeList>() {
921            Ok(fsl) => Ok(fsl),
922            Err(array) => Ok(Canonical::execute(array, ctx)?.into_fixed_size_list()),
923        }
924    }
925}
926
927/// Execute the array to canonical form and unwrap as a [`StructArray`].
928///
929/// This will panic if the array's dtype is not struct.
930impl Executable for StructArray {
931    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
932        match array.try_downcast::<Struct>() {
933            Ok(struct_array) => Ok(struct_array),
934            Err(array) => Ok(Canonical::execute(array, ctx)?.into_struct()),
935        }
936    }
937}
938
/// A view into a canonical array type.
///
/// Each variant wraps an `ArrayView` over one of the canonical array types, mirroring the
/// variants of [`Canonical`].
///
/// Uses `ArrayView<V>` because these are obtained by
/// downcasting through the `Matcher` trait which returns `ArrayView<V>`.
#[derive(Debug, Clone, Copy)]
pub enum CanonicalView<'a> {
    /// A view over a [`Null`] array.
    Null(ArrayView<'a, Null>),
    /// A view over a [`Bool`] array.
    Bool(ArrayView<'a, Bool>),
    /// A view over a [`Primitive`] array.
    Primitive(ArrayView<'a, Primitive>),
    /// A view over a [`Decimal`] array.
    Decimal(ArrayView<'a, Decimal>),
    /// A view over a [`VarBinView`] array.
    VarBinView(ArrayView<'a, VarBinView>),
    /// A view over a [`ListView`] array (note the variant is named `List`).
    List(ArrayView<'a, ListView>),
    /// A view over a [`FixedSizeList`] array.
    FixedSizeList(ArrayView<'a, FixedSizeList>),
    /// A view over a [`Struct`] array.
    Struct(ArrayView<'a, Struct>),
    /// A view over an [`Extension`] array.
    Extension(ArrayView<'a, Extension>),
    /// A view over a [`Variant`] array.
    Variant(ArrayView<'a, Variant>),
}
956
957impl From<CanonicalView<'_>> for Canonical {
958    fn from(value: CanonicalView<'_>) -> Self {
959        match value {
960            CanonicalView::Null(a) => Canonical::Null(a.into_owned()),
961            CanonicalView::Bool(a) => Canonical::Bool(a.into_owned()),
962            CanonicalView::Primitive(a) => Canonical::Primitive(a.into_owned()),
963            CanonicalView::Decimal(a) => Canonical::Decimal(a.into_owned()),
964            CanonicalView::VarBinView(a) => Canonical::VarBinView(a.into_owned()),
965            CanonicalView::List(a) => Canonical::List(a.into_owned()),
966            CanonicalView::FixedSizeList(a) => Canonical::FixedSizeList(a.into_owned()),
967            CanonicalView::Struct(a) => Canonical::Struct(a.into_owned()),
968            CanonicalView::Extension(a) => Canonical::Extension(a.into_owned()),
969            CanonicalView::Variant(a) => Canonical::Variant(a.into_owned()),
970        }
971    }
972}
973
974impl CanonicalView<'_> {
975    /// Convert to a type-erased [`ArrayRef`].
976    pub fn to_array_ref(&self) -> ArrayRef {
977        match self {
978            CanonicalView::Null(a) => a.array().clone(),
979            CanonicalView::Bool(a) => a.array().clone(),
980            CanonicalView::Primitive(a) => a.array().clone(),
981            CanonicalView::Decimal(a) => a.array().clone(),
982            CanonicalView::VarBinView(a) => a.array().clone(),
983            CanonicalView::List(a) => a.array().clone(),
984            CanonicalView::FixedSizeList(a) => a.array().clone(),
985            CanonicalView::Struct(a) => a.array().clone(),
986            CanonicalView::Extension(a) => a.array().clone(),
987            CanonicalView::Variant(a) => a.array().clone(),
988        }
989    }
990}
991
992/// A matcher for any canonical array type.
993pub struct AnyCanonical;
994impl Matcher for AnyCanonical {
995    type Match<'a> = CanonicalView<'a>;
996
997    fn matches(array: &ArrayRef) -> bool {
998        array.is::<Null>()
999            || array.is::<Bool>()
1000            || array.is::<Primitive>()
1001            || array.is::<Decimal>()
1002            || array.is::<Struct>()
1003            || array.is::<ListView>()
1004            || array.is::<FixedSizeList>()
1005            || array.is::<VarBinView>()
1006            || array.is::<Variant>()
1007            || array.is::<Extension>()
1008            || array.is::<Variant>()
1009    }
1010
1011    fn try_match<'a>(array: &'a ArrayRef) -> Option<Self::Match<'a>> {
1012        if let Some(a) = array.as_opt::<Null>() {
1013            Some(CanonicalView::Null(a))
1014        } else if let Some(a) = array.as_opt::<Bool>() {
1015            Some(CanonicalView::Bool(a))
1016        } else if let Some(a) = array.as_opt::<Primitive>() {
1017            Some(CanonicalView::Primitive(a))
1018        } else if let Some(a) = array.as_opt::<Decimal>() {
1019            Some(CanonicalView::Decimal(a))
1020        } else if let Some(a) = array.as_opt::<Struct>() {
1021            Some(CanonicalView::Struct(a))
1022        } else if let Some(a) = array.as_opt::<ListView>() {
1023            Some(CanonicalView::List(a))
1024        } else if let Some(a) = array.as_opt::<FixedSizeList>() {
1025            Some(CanonicalView::FixedSizeList(a))
1026        } else if let Some(a) = array.as_opt::<VarBinView>() {
1027            Some(CanonicalView::VarBinView(a))
1028        } else if let Some(a) = array.as_opt::<Variant>() {
1029            Some(CanonicalView::Variant(a))
1030        } else {
1031            array.as_opt::<Extension>().map(CanonicalView::Extension)
1032        }
1033    }
1034}
1035
1036#[cfg(test)]
1037mod test {
1038    use std::sync::Arc;
1039
1040    use arrow_array::Array as ArrowArray;
1041    use arrow_array::ArrayRef as ArrowArrayRef;
1042    use arrow_array::ListArray as ArrowListArray;
1043    use arrow_array::PrimitiveArray as ArrowPrimitiveArray;
1044    use arrow_array::StringArray;
1045    use arrow_array::StringViewArray;
1046    use arrow_array::StructArray as ArrowStructArray;
1047    use arrow_array::cast::AsArray;
1048    use arrow_array::types::Int32Type;
1049    use arrow_array::types::Int64Type;
1050    use arrow_array::types::UInt64Type;
1051    use arrow_buffer::NullBufferBuilder;
1052    use arrow_buffer::OffsetBuffer;
1053    use arrow_schema::DataType;
1054    use arrow_schema::Field;
1055    use vortex_buffer::buffer;
1056
1057    use crate::ArrayRef;
1058    use crate::IntoArray;
1059    use crate::arrays::ConstantArray;
1060    use crate::arrow::FromArrowArray;
1061    use crate::arrow::IntoArrowArray;
1062    use crate::canonical::StructArray;
1063
1064    #[test]
1065    fn test_canonicalize_nested_struct() {
1066        // Create a struct array with multiple internal components.
1067        let nested_struct_array = StructArray::from_fields(&[
1068            ("a", buffer![1u64].into_array()),
1069            (
1070                "b",
1071                StructArray::from_fields(&[(
1072                    "inner_a",
1073                    // The nested struct contains a ConstantArray representing the primitive array
1074                    //   [100i64]
1075                    // ConstantArray is not a canonical type, so converting `into_arrow()` should
1076                    // map this to the nearest canonical type (PrimitiveArray).
1077                    ConstantArray::new(100i64, 1).into_array(),
1078                )])
1079                .unwrap()
1080                .into_array(),
1081            ),
1082        ])
1083        .unwrap();
1084
1085        let arrow_struct = nested_struct_array
1086            .into_array()
1087            .into_arrow_preferred()
1088            .unwrap()
1089            .as_any()
1090            .downcast_ref::<ArrowStructArray>()
1091            .cloned()
1092            .unwrap();
1093
1094        assert!(
1095            arrow_struct
1096                .column(0)
1097                .as_any()
1098                .downcast_ref::<ArrowPrimitiveArray<UInt64Type>>()
1099                .is_some()
1100        );
1101
1102        let inner_struct = Arc::clone(arrow_struct.column(1))
1103            .as_any()
1104            .downcast_ref::<ArrowStructArray>()
1105            .cloned()
1106            .unwrap();
1107
1108        let inner_a = inner_struct
1109            .column(0)
1110            .as_any()
1111            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
1112        assert!(inner_a.is_some());
1113
1114        assert_eq!(
1115            inner_a.cloned().unwrap(),
1116            ArrowPrimitiveArray::from_iter([100i64])
1117        );
1118    }
1119
1120    #[test]
1121    fn roundtrip_struct() {
1122        let mut nulls = NullBufferBuilder::new(6);
1123        nulls.append_n_non_nulls(4);
1124        nulls.append_null();
1125        nulls.append_non_null();
1126        let names = Arc::new(StringViewArray::from_iter(vec![
1127            Some("Joseph"),
1128            None,
1129            Some("Angela"),
1130            Some("Mikhail"),
1131            None,
1132            None,
1133        ]));
1134        let ages = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
1135            Some(25),
1136            Some(31),
1137            None,
1138            Some(57),
1139            None,
1140            None,
1141        ]));
1142
1143        let arrow_struct = ArrowStructArray::new(
1144            vec![
1145                Arc::new(Field::new("name", DataType::Utf8View, true)),
1146                Arc::new(Field::new("age", DataType::Int32, true)),
1147            ]
1148            .into(),
1149            vec![names, ages],
1150            nulls.finish(),
1151        );
1152
1153        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true).unwrap();
1154
1155        assert_eq!(
1156            &arrow_struct,
1157            vortex_struct.into_arrow_preferred().unwrap().as_struct()
1158        );
1159    }
1160
1161    #[test]
1162    fn roundtrip_list() {
1163        let names = Arc::new(StringArray::from_iter(vec![
1164            Some("Joseph"),
1165            Some("Angela"),
1166            Some("Mikhail"),
1167        ]));
1168
1169        let arrow_list = ArrowListArray::new(
1170            Arc::new(Field::new_list_field(DataType::Utf8, true)),
1171            OffsetBuffer::from_lengths(vec![0, 2, 1]),
1172            names,
1173            None,
1174        );
1175        let list_data_type = arrow_list.data_type();
1176
1177        let vortex_list = ArrayRef::from_arrow(&arrow_list, true).unwrap();
1178
1179        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();
1180
1181        assert_eq!(
1182            (Arc::new(arrow_list.clone()) as ArrowArrayRef).as_ref(),
1183            rt_arrow_list.as_ref()
1184        );
1185    }
1186}