Skip to main content

vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use std::sync::Arc;
7
8use vortex_buffer::BitBuffer;
9use vortex_buffer::Buffer;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_ensure;
13use vortex_error::vortex_panic;
14
15use crate::ArrayRef;
16use crate::Executable;
17use crate::ExecutionCtx;
18use crate::IntoArray;
19use crate::array::ArrayView;
20use crate::array::child_to_validity;
21use crate::arrays::Bool;
22use crate::arrays::BoolArray;
23use crate::arrays::Decimal;
24use crate::arrays::DecimalArray;
25use crate::arrays::Extension;
26use crate::arrays::ExtensionArray;
27use crate::arrays::FixedSizeList;
28use crate::arrays::FixedSizeListArray;
29use crate::arrays::ListView;
30use crate::arrays::ListViewArray;
31use crate::arrays::Null;
32use crate::arrays::NullArray;
33use crate::arrays::Primitive;
34use crate::arrays::PrimitiveArray;
35use crate::arrays::Struct;
36use crate::arrays::StructArray;
37use crate::arrays::VarBinView;
38use crate::arrays::VarBinViewArray;
39use crate::arrays::Variant;
40use crate::arrays::VariantArray;
41use crate::arrays::bool::BoolDataParts;
42use crate::arrays::decimal::DecimalDataParts;
43use crate::arrays::extension::ExtensionArrayExt;
44use crate::arrays::fixed_size_list::FixedSizeListArrayExt;
45use crate::arrays::listview::ListViewDataParts;
46use crate::arrays::listview::ListViewRebuildMode;
47use crate::arrays::primitive::PrimitiveDataParts;
48use crate::arrays::struct_::StructDataParts;
49use crate::arrays::varbinview::VarBinViewDataParts;
50use crate::arrays::variant::VariantArrayExt;
51use crate::dtype::DType;
52use crate::dtype::NativePType;
53use crate::dtype::Nullability;
54use crate::dtype::PType;
55use crate::match_each_decimal_value_type;
56use crate::match_each_native_ptype;
57use crate::matcher::Matcher;
58use crate::validity::Validity;
59
60/// An enum capturing the default uncompressed encodings for each [Vortex type](DType).
61///
62/// Any array can be decoded into canonical form via the `to_canonical`
63/// trait method. This is the simplest encoding for a type, and will not be compressed but may
64/// contain compressed child arrays.
65///
66/// Canonical form is useful for doing type-specific compute where you need to know that all
67/// elements are laid out decompressed and contiguous in memory.
68///
69/// Each `Canonical` variant has a corresponding [`DType`] variant, with the notable exception of
70/// [`Canonical::VarBinView`], which is the canonical encoding for both [`DType::Utf8`] and
71/// [`DType::Binary`].
72///
73/// # Laziness
74///
75/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
76/// `Struct` type, individual column child arrays may still be compressed. This allows
77/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
78/// never need to be decoded into canonical form at all depending on the compute.
79///
80/// # Arrow interoperability
81///
82/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
83/// zero-copy, and the corresponding Arrow array types can also be built directly.
84///
85/// The full list of canonical types and their equivalent Arrow array types are:
86///
87/// * `NullArray`: [`arrow_array::NullArray`]
88/// * `BoolArray`: [`arrow_array::BooleanArray`]
89/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
90/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
91/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
92/// * `ListViewArray`: [`arrow_array::ListViewArray`]
93/// * `FixedSizeListArray`: [`arrow_array::FixedSizeListArray`]
94/// * `StructArray`: [`arrow_array::StructArray`]
95///
96/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
97/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
98/// create ambiguity.
99/// Thus, if you receive an Arrow array, compress it using Vortex, and then
100/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
101/// variants to hold the data.
102///
103/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
104/// will correspond to an arrow-rs [`arrow_schema::DataType`].
105///
106/// # Views support
107///
108/// Binary and String views, also known as "German strings" are a better encoding format for
109/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
110/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
111/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
112/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
113/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
114/// from memory.
115///
116/// # For Developers
117///
118/// If you add another variant to this enum, make sure to update `dyn Array::is_canonical`,
119/// and the fuzzer in `fuzz/fuzz_targets/array_ops.rs`.
#[derive(Debug, Clone)]
pub enum Canonical {
    /// Canonical array for [`DType::Null`].
    Null(NullArray),
    /// Canonical array for [`DType::Bool`].
    Bool(BoolArray),
    /// Canonical array for [`DType::Primitive`].
    Primitive(PrimitiveArray),
    /// Canonical array for [`DType::Decimal`].
    Decimal(DecimalArray),
    /// Canonical array for both [`DType::Utf8`] and [`DType::Binary`].
    VarBinView(VarBinViewArray),
    /// Canonical array for [`DType::List`]; note that the payload is a list *view* array.
    List(ListViewArray),
    /// Canonical array for [`DType::FixedSizeList`].
    FixedSizeList(FixedSizeListArray),
    /// Canonical array for [`DType::Struct`].
    Struct(StructArray),
    /// Canonical array for [`DType::Extension`], wrapping a storage array.
    Extension(ExtensionArray),
    /// Canonical array for [`DType::Variant`]. [`Canonical::empty`] does not support this dtype.
    Variant(VariantArray),
}
133
/// Dispatch over every [`Canonical`] variant: bind the wrapped array to `$binding` and evaluate
/// `$body` with it, regardless of which variant is held.
macro_rules! match_each_canonical {
    ($canonical:expr, | $binding:ident | $body:expr) => {{
        match $canonical {
            Canonical::Null($binding) => $body,
            Canonical::Bool($binding) => $body,
            Canonical::Primitive($binding) => $body,
            Canonical::Decimal($binding) => $body,
            Canonical::VarBinView($binding) => $body,
            Canonical::List($binding) => $body,
            Canonical::FixedSizeList($binding) => $body,
            Canonical::Struct($binding) => $body,
            Canonical::Extension($binding) => $body,
            Canonical::Variant($binding) => $body,
        }
    }};
}
151
152impl Canonical {
153    /// Create an empty canonical array of the given dtype.
154    pub fn empty(dtype: &DType) -> Canonical {
155        match dtype {
156            DType::Null => Canonical::Null(NullArray::new(0)),
157            DType::Bool(n) => Canonical::Bool(unsafe {
158                BoolArray::new_unchecked(BitBuffer::empty(), Validity::from(n))
159            }),
160            DType::Primitive(ptype, n) => {
161                match_each_native_ptype!(ptype, |P| {
162                    Canonical::Primitive(unsafe {
163                        PrimitiveArray::new_unchecked(Buffer::<P>::empty(), Validity::from(n))
164                    })
165                })
166            }
167            DType::Decimal(decimal_type, n) => {
168                match_each_decimal_value_type!(
169                    DecimalType::smallest_decimal_value_type(decimal_type),
170                    |D| {
171                        Canonical::Decimal(unsafe {
172                            DecimalArray::new_unchecked::<D>(
173                                Buffer::empty(),
174                                *decimal_type,
175                                Validity::from(n),
176                            )
177                        })
178                    }
179                )
180            }
181            DType::Utf8(n) => Canonical::VarBinView(unsafe {
182                VarBinViewArray::new_unchecked(
183                    Buffer::empty(),
184                    Arc::new([]),
185                    dtype.clone(),
186                    Validity::from(n),
187                )
188            }),
189            DType::Binary(n) => Canonical::VarBinView(unsafe {
190                VarBinViewArray::new_unchecked(
191                    Buffer::empty(),
192                    Arc::new([]),
193                    dtype.clone(),
194                    Validity::from(n),
195                )
196            }),
197            DType::Struct(struct_dtype, n) => Canonical::Struct(unsafe {
198                StructArray::new_unchecked(
199                    struct_dtype
200                        .fields()
201                        .map(|f| Canonical::empty(&f).into_array())
202                        .collect::<Arc<[_]>>(),
203                    struct_dtype.clone(),
204                    0,
205                    Validity::from(n),
206                )
207            }),
208            DType::List(dtype, n) => Canonical::List(unsafe {
209                ListViewArray::new_unchecked(
210                    Canonical::empty(dtype).into_array(),
211                    Canonical::empty(&DType::Primitive(PType::U8, Nullability::NonNullable))
212                        .into_array(),
213                    Canonical::empty(&DType::Primitive(PType::U8, Nullability::NonNullable))
214                        .into_array(),
215                    Validity::from(n),
216                )
217                // An empty list view is trivially copyable to a list.
218                .with_zero_copy_to_list(true)
219            }),
220            DType::FixedSizeList(elem_dtype, list_size, null) => Canonical::FixedSizeList(unsafe {
221                FixedSizeListArray::new_unchecked(
222                    Canonical::empty(elem_dtype).into_array(),
223                    *list_size,
224                    Validity::from(null),
225                    0,
226                )
227            }),
228            DType::Extension(ext_dtype) => Canonical::Extension(ExtensionArray::new(
229                ext_dtype.clone(),
230                Canonical::empty(ext_dtype.storage_dtype()).into_array(),
231            )),
232            DType::Variant(_) => {
233                vortex_panic!(InvalidArgument: "Canonical empty is not supported for Variant")
234            }
235        }
236    }
237
238    pub fn len(&self) -> usize {
239        match_each_canonical!(self, |arr| arr.len())
240    }
241
242    pub fn dtype(&self) -> &DType {
243        match_each_canonical!(self, |arr| arr.dtype())
244    }
245
246    pub fn is_empty(&self) -> bool {
247        match_each_canonical!(self, |arr| arr.is_empty())
248    }
249}
250
251impl Canonical {
252    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
253    ///
254    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
255    /// they can accumulate wasted space after slicing and taking operations.
256    ///
257    /// This operation is very expensive and can result in things like allocations, full-scans
258    /// and copy operations.
259    pub fn compact(&self) -> VortexResult<Canonical> {
260        match self {
261            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
262            Canonical::List(array) => Ok(Canonical::List(
263                array.rebuild(ListViewRebuildMode::TrimElements)?,
264            )),
265            _ => Ok(self.clone()),
266        }
267    }
268}
269
270// Unwrap canonical type back down to specialized type.
271impl Canonical {
272    pub fn as_null(&self) -> &NullArray {
273        if let Canonical::Null(a) = self {
274            a
275        } else {
276            vortex_panic!("Cannot get NullArray from {:?}", &self)
277        }
278    }
279
280    pub fn into_null(self) -> NullArray {
281        if let Canonical::Null(a) = self {
282            a
283        } else {
284            vortex_panic!("Cannot unwrap NullArray from {:?}", &self)
285        }
286    }
287
288    pub fn as_bool(&self) -> &BoolArray {
289        if let Canonical::Bool(a) = self {
290            a
291        } else {
292            vortex_panic!("Cannot get BoolArray from {:?}", &self)
293        }
294    }
295
296    pub fn into_bool(self) -> BoolArray {
297        if let Canonical::Bool(a) = self {
298            a
299        } else {
300            vortex_panic!("Cannot unwrap BoolArray from {:?}", &self)
301        }
302    }
303
304    pub fn as_primitive(&self) -> &PrimitiveArray {
305        if let Canonical::Primitive(a) = self {
306            a
307        } else {
308            vortex_panic!("Cannot get PrimitiveArray from {:?}", &self)
309        }
310    }
311
312    pub fn into_primitive(self) -> PrimitiveArray {
313        if let Canonical::Primitive(a) = self {
314            a
315        } else {
316            vortex_panic!("Cannot unwrap PrimitiveArray from {:?}", &self)
317        }
318    }
319
320    pub fn as_decimal(&self) -> &DecimalArray {
321        if let Canonical::Decimal(a) = self {
322            a
323        } else {
324            vortex_panic!("Cannot get DecimalArray from {:?}", &self)
325        }
326    }
327
328    pub fn into_decimal(self) -> DecimalArray {
329        if let Canonical::Decimal(a) = self {
330            a
331        } else {
332            vortex_panic!("Cannot unwrap DecimalArray from {:?}", &self)
333        }
334    }
335
336    pub fn as_varbinview(&self) -> &VarBinViewArray {
337        if let Canonical::VarBinView(a) = self {
338            a
339        } else {
340            vortex_panic!("Cannot get VarBinViewArray from {:?}", &self)
341        }
342    }
343
344    pub fn into_varbinview(self) -> VarBinViewArray {
345        if let Canonical::VarBinView(a) = self {
346            a
347        } else {
348            vortex_panic!("Cannot unwrap VarBinViewArray from {:?}", &self)
349        }
350    }
351
352    pub fn as_listview(&self) -> &ListViewArray {
353        if let Canonical::List(a) = self {
354            a
355        } else {
356            vortex_panic!("Cannot get ListArray from {:?}", &self)
357        }
358    }
359
360    pub fn into_listview(self) -> ListViewArray {
361        if let Canonical::List(a) = self {
362            a
363        } else {
364            vortex_panic!("Cannot unwrap ListArray from {:?}", &self)
365        }
366    }
367
368    pub fn as_fixed_size_list(&self) -> &FixedSizeListArray {
369        if let Canonical::FixedSizeList(a) = self {
370            a
371        } else {
372            vortex_panic!("Cannot get FixedSizeListArray from {:?}", &self)
373        }
374    }
375
376    pub fn into_fixed_size_list(self) -> FixedSizeListArray {
377        if let Canonical::FixedSizeList(a) = self {
378            a
379        } else {
380            vortex_panic!("Cannot unwrap FixedSizeListArray from {:?}", &self)
381        }
382    }
383
384    pub fn as_struct(&self) -> &StructArray {
385        if let Canonical::Struct(a) = self {
386            a
387        } else {
388            vortex_panic!("Cannot get StructArray from {:?}", &self)
389        }
390    }
391
392    pub fn into_struct(self) -> StructArray {
393        if let Canonical::Struct(a) = self {
394            a
395        } else {
396            vortex_panic!("Cannot unwrap StructArray from {:?}", &self)
397        }
398    }
399
400    pub fn as_extension(&self) -> &ExtensionArray {
401        if let Canonical::Extension(a) = self {
402            a
403        } else {
404            vortex_panic!("Cannot get ExtensionArray from {:?}", &self)
405        }
406    }
407
408    pub fn into_extension(self) -> ExtensionArray {
409        if let Canonical::Extension(a) = self {
410            a
411        } else {
412            vortex_panic!("Cannot unwrap ExtensionArray from {:?}", &self)
413        }
414    }
415}
416
417impl IntoArray for Canonical {
418    fn into_array(self) -> ArrayRef {
419        match_each_canonical!(self, |arr| arr.into_array())
420    }
421}
422
/// Convenience accessors that canonicalize a borrowed array and return it as a specific owned
/// canonical array type.
///
/// # Canonicalization
///
/// In this file the trait is implemented for [`ArrayRef`]. That implementation panics if
/// canonicalization fails or if the canonical form is not the requested variant.
#[deprecated(note = "use `array.execute::<T>(ctx)` instead")]
pub trait ToCanonical {
    /// Canonicalize into a [`NullArray`] if the target is [`Null`](DType::Null) typed.
    #[deprecated(note = "use `array.execute::<NullArray>(ctx)` instead")]
    fn to_null(&self) -> NullArray;

    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`](DType::Bool) typed.
    #[deprecated(note = "use `array.execute::<BoolArray>(ctx)` instead")]
    fn to_bool(&self) -> BoolArray;

    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`](DType::Primitive)
    /// typed.
    #[deprecated(note = "use `array.execute::<PrimitiveArray>(ctx)` instead")]
    fn to_primitive(&self) -> PrimitiveArray;

    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`](DType::Decimal)
    /// typed.
    #[deprecated(note = "use `array.execute::<DecimalArray>(ctx)` instead")]
    fn to_decimal(&self) -> DecimalArray;

    /// Canonicalize into a [`StructArray`] if the target is [`Struct`](DType::Struct) typed.
    #[deprecated(note = "use `array.execute::<StructArray>(ctx)` instead")]
    fn to_struct(&self) -> StructArray;

    /// Canonicalize into a [`ListViewArray`] if the target is [`List`](DType::List) typed.
    #[deprecated(note = "use `array.execute::<ListViewArray>(ctx)` instead")]
    fn to_listview(&self) -> ListViewArray;

    /// Canonicalize into a [`FixedSizeListArray`] if the target is
    /// [`FixedSizeList`](DType::FixedSizeList) typed.
    #[deprecated(note = "use `array.execute::<FixedSizeListArray>(ctx)` instead")]
    fn to_fixed_size_list(&self) -> FixedSizeListArray;

    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`](DType::Utf8)
    /// or [`Binary`](DType::Binary) typed.
    #[deprecated(note = "use `array.execute::<VarBinViewArray>(ctx)` instead")]
    fn to_varbinview(&self) -> VarBinViewArray;

    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`](DType::Extension)
    /// typed.
    #[deprecated(note = "use `array.execute::<ExtensionArray>(ctx)` instead")]
    fn to_extension(&self) -> ExtensionArray;
}
471
472// Blanket impl for all Array encodings.
473#[expect(deprecated)]
474impl ToCanonical for ArrayRef {
475    fn to_null(&self) -> NullArray {
476        #[expect(deprecated)]
477        let result = self.to_canonical().vortex_expect("to_canonical failed");
478        result.into_null()
479    }
480
481    fn to_bool(&self) -> BoolArray {
482        #[expect(deprecated)]
483        let result = self.to_canonical().vortex_expect("to_canonical failed");
484        result.into_bool()
485    }
486
487    fn to_primitive(&self) -> PrimitiveArray {
488        #[expect(deprecated)]
489        let result = self.to_canonical().vortex_expect("to_canonical failed");
490        result.into_primitive()
491    }
492
493    fn to_decimal(&self) -> DecimalArray {
494        #[expect(deprecated)]
495        let result = self.to_canonical().vortex_expect("to_canonical failed");
496        result.into_decimal()
497    }
498
499    fn to_struct(&self) -> StructArray {
500        #[expect(deprecated)]
501        let result = self.to_canonical().vortex_expect("to_canonical failed");
502        result.into_struct()
503    }
504
505    fn to_listview(&self) -> ListViewArray {
506        #[expect(deprecated)]
507        let result = self.to_canonical().vortex_expect("to_canonical failed");
508        result.into_listview()
509    }
510
511    fn to_fixed_size_list(&self) -> FixedSizeListArray {
512        #[expect(deprecated)]
513        let result = self.to_canonical().vortex_expect("to_canonical failed");
514        result.into_fixed_size_list()
515    }
516
517    fn to_varbinview(&self) -> VarBinViewArray {
518        #[expect(deprecated)]
519        let result = self.to_canonical().vortex_expect("to_canonical failed");
520        result.into_varbinview()
521    }
522
523    fn to_extension(&self) -> ExtensionArray {
524        #[expect(deprecated)]
525        let result = self.to_canonical().vortex_expect("to_canonical failed");
526        result.into_extension()
527    }
528}
529
530impl From<Canonical> for ArrayRef {
531    fn from(value: Canonical) -> Self {
532        match_each_canonical!(value, |arr| arr.into_array())
533    }
534}
535
536/// Execute into [`Canonical`] by running `execute_until` with the [`AnyCanonical`] matcher.
537///
538/// Unlike executing into [`crate::Columnar`], this will fully expand constant arrays into their
539/// canonical form. Callers should prefer to execute into `Columnar` if they are able to optimize
540/// their use for constant arrays.
541impl Executable for Canonical {
542    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
543        let result = array.execute_until::<AnyCanonical>(ctx)?;
544        Ok(result
545            .as_opt::<AnyCanonical>()
546            .map(Canonical::from)
547            .vortex_expect("execute_until::<AnyCanonical> must return a canonical array"))
548    }
549}
550
551/// Recursively execute the array until it reaches canonical form along with its validity.
552///
553/// Callers should prefer to execute into `Columnar` instead of this specific target.
554/// This target is useful when preparing arrays for writing.
555pub struct CanonicalValidity(pub Canonical);
556
557impl Executable for CanonicalValidity {
558    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
559        match array.execute::<Canonical>(ctx)? {
560            n @ Canonical::Null(_) => Ok(CanonicalValidity(n)),
561            Canonical::Bool(b) => {
562                let validity = child_to_validity(b.slots()[0].as_ref(), b.dtype().nullability());
563                let len = b.len();
564                let BoolDataParts { bits, offset, len } = b.into_data().into_parts(len);
565                Ok(CanonicalValidity(Canonical::Bool(
566                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
567                )))
568            }
569            Canonical::Primitive(p) => {
570                let PrimitiveDataParts {
571                    ptype,
572                    buffer,
573                    validity,
574                } = p.into_data_parts();
575                Ok(CanonicalValidity(Canonical::Primitive(unsafe {
576                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
577                })))
578            }
579            Canonical::Decimal(d) => {
580                let DecimalDataParts {
581                    decimal_dtype,
582                    values,
583                    values_type,
584                    validity,
585                } = d.into_data_parts();
586                Ok(CanonicalValidity(Canonical::Decimal(unsafe {
587                    DecimalArray::new_unchecked_handle(
588                        values,
589                        values_type,
590                        decimal_dtype,
591                        validity.execute(ctx)?,
592                    )
593                })))
594            }
595            Canonical::VarBinView(vbv) => {
596                let VarBinViewDataParts {
597                    dtype,
598                    buffers,
599                    views,
600                    validity,
601                } = vbv.into_data_parts();
602                Ok(CanonicalValidity(Canonical::VarBinView(unsafe {
603                    VarBinViewArray::new_handle_unchecked(
604                        views,
605                        buffers,
606                        dtype,
607                        validity.execute(ctx)?,
608                    )
609                })))
610            }
611            Canonical::List(l) => {
612                let zctl = l.is_zero_copy_to_list();
613                let ListViewDataParts {
614                    elements,
615                    offsets,
616                    sizes,
617                    validity,
618                    ..
619                } = l.into_data_parts();
620                Ok(CanonicalValidity(Canonical::List(unsafe {
621                    ListViewArray::new_unchecked(elements, offsets, sizes, validity.execute(ctx)?)
622                        .with_zero_copy_to_list(zctl)
623                })))
624            }
625            Canonical::FixedSizeList(fsl) => {
626                let list_size = fsl.list_size();
627                let len = fsl.len();
628                let parts = fsl.into_data_parts();
629                let elements = parts.elements;
630                let validity = parts.validity;
631                Ok(CanonicalValidity(Canonical::FixedSizeList(
632                    FixedSizeListArray::new(elements, list_size, validity.execute(ctx)?, len),
633                )))
634            }
635            Canonical::Struct(st) => {
636                let len = st.len();
637                let StructDataParts {
638                    struct_fields,
639                    fields,
640                    validity,
641                } = st.into_data_parts();
642                Ok(CanonicalValidity(Canonical::Struct(unsafe {
643                    StructArray::new_unchecked(fields, struct_fields, len, validity.execute(ctx)?)
644                })))
645            }
646            Canonical::Extension(ext) => Ok(CanonicalValidity(Canonical::Extension(
647                ExtensionArray::new(
648                    ext.ext_dtype().clone(),
649                    ext.storage_array()
650                        .clone()
651                        .execute::<CanonicalValidity>(ctx)?
652                        .0
653                        .into_array(),
654                ),
655            ))),
656            Canonical::Variant(variant) => {
657                Ok(CanonicalValidity(Canonical::Variant(VariantArray::new(
658                    variant
659                        .child()
660                        .clone()
661                        .execute::<CanonicalValidity>(ctx)?
662                        .0
663                        .into_array(),
664                ))))
665            }
666        }
667    }
668}
669
670/// Recursively execute the array until all of its children are canonical.
671///
672/// This method is useful to guarantee that all operators are fully executed,
673/// callers should prefer an execution target that's suitable for their use case instead of this one.
674pub struct RecursiveCanonical(pub Canonical);
675
676impl Executable for RecursiveCanonical {
677    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
678        match array.execute::<Canonical>(ctx)? {
679            n @ Canonical::Null(_) => Ok(RecursiveCanonical(n)),
680            Canonical::Bool(b) => {
681                let validity = child_to_validity(b.slots()[0].as_ref(), b.dtype().nullability());
682                let len = b.len();
683                let BoolDataParts { bits, offset, len } = b.into_data().into_parts(len);
684                Ok(RecursiveCanonical(Canonical::Bool(
685                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
686                )))
687            }
688            Canonical::Primitive(p) => {
689                let PrimitiveDataParts {
690                    ptype,
691                    buffer,
692                    validity,
693                } = p.into_data_parts();
694                Ok(RecursiveCanonical(Canonical::Primitive(unsafe {
695                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
696                })))
697            }
698            Canonical::Decimal(d) => {
699                let DecimalDataParts {
700                    decimal_dtype,
701                    values,
702                    values_type,
703                    validity,
704                } = d.into_data_parts();
705                Ok(RecursiveCanonical(Canonical::Decimal(unsafe {
706                    DecimalArray::new_unchecked_handle(
707                        values,
708                        values_type,
709                        decimal_dtype,
710                        validity.execute(ctx)?,
711                    )
712                })))
713            }
714            Canonical::VarBinView(vbv) => {
715                let VarBinViewDataParts {
716                    dtype,
717                    buffers,
718                    views,
719                    validity,
720                } = vbv.into_data_parts();
721                Ok(RecursiveCanonical(Canonical::VarBinView(unsafe {
722                    VarBinViewArray::new_handle_unchecked(
723                        views,
724                        buffers,
725                        dtype,
726                        validity.execute(ctx)?,
727                    )
728                })))
729            }
730            Canonical::List(l) => {
731                let zctl = l.is_zero_copy_to_list();
732                let ListViewDataParts {
733                    elements,
734                    offsets,
735                    sizes,
736                    validity,
737                    ..
738                } = l.into_data_parts();
739                Ok(RecursiveCanonical(Canonical::List(unsafe {
740                    ListViewArray::new_unchecked(
741                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
742                        offsets.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
743                        sizes.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
744                        validity.execute(ctx)?,
745                    )
746                    .with_zero_copy_to_list(zctl)
747                })))
748            }
749            Canonical::FixedSizeList(fsl) => {
750                let list_size = fsl.list_size();
751                let len = fsl.len();
752                let parts = fsl.into_data_parts();
753                let elements = parts.elements;
754                let validity = parts.validity;
755                Ok(RecursiveCanonical(Canonical::FixedSizeList(
756                    FixedSizeListArray::new(
757                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
758                        list_size,
759                        validity.execute(ctx)?,
760                        len,
761                    ),
762                )))
763            }
764            Canonical::Struct(st) => {
765                let len = st.len();
766                let StructDataParts {
767                    struct_fields,
768                    fields,
769                    validity,
770                } = st.into_data_parts();
771                let executed_fields = fields
772                    .iter()
773                    .map(|f| Ok(f.clone().execute::<RecursiveCanonical>(ctx)?.0.into_array()))
774                    .collect::<VortexResult<Arc<[_]>>>()?;
775
776                Ok(RecursiveCanonical(Canonical::Struct(unsafe {
777                    StructArray::new_unchecked(
778                        executed_fields,
779                        struct_fields,
780                        len,
781                        validity.execute(ctx)?,
782                    )
783                })))
784            }
785            Canonical::Extension(ext) => Ok(RecursiveCanonical(Canonical::Extension(
786                ExtensionArray::new(
787                    ext.ext_dtype().clone(),
788                    ext.storage_array()
789                        .clone()
790                        .execute::<RecursiveCanonical>(ctx)?
791                        .0
792                        .into_array(),
793                ),
794            ))),
795            Canonical::Variant(variant) => {
796                Ok(RecursiveCanonical(Canonical::Variant(VariantArray::new(
797                    variant
798                        .child()
799                        .clone()
800                        .execute::<RecursiveCanonical>(ctx)?
801                        .0
802                        .into_array(),
803                ))))
804            }
805        }
806    }
807}
808
809/// Execute a primitive typed array into a buffer of native values, assuming all values are valid.
810///
811/// # Errors
812///
813/// Returns a `VortexError` if the array is not all-valid (has any nulls).
814impl<T: NativePType> Executable for Buffer<T> {
815    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
816        let array = PrimitiveArray::execute(array, ctx)?;
817        vortex_ensure!(
818            matches!(
819                array.validity()?,
820                Validity::NonNullable | Validity::AllValid
821            ),
822            "Cannot execute to native buffer: array is not all-valid."
823        );
824        Ok(array.into_buffer())
825    }
826}
827
828/// Execute the array to canonical form and unwrap as a [`PrimitiveArray`].
829///
830/// This will panic if the array's dtype is not primitive.
831impl Executable for PrimitiveArray {
832    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
833        match array.try_downcast::<Primitive>() {
834            Ok(primitive) => Ok(primitive),
835            Err(array) => Ok(Canonical::execute(array, ctx)?.into_primitive()),
836        }
837    }
838}
839
840/// Execute the array to canonical form and unwrap as a [`BoolArray`].
841///
842/// This will panic if the array's dtype is not bool.
843impl Executable for BoolArray {
844    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
845        match array.try_downcast::<Bool>() {
846            Ok(bool_array) => Ok(bool_array),
847            Err(array) => Ok(Canonical::execute(array, ctx)?.into_bool()),
848        }
849    }
850}
851
852/// Execute the array to a [`BitBuffer`], aka a non-nullable  [`BoolArray`].
853///
854/// This will panic if the array's dtype is not non-nullable bool.
855impl Executable for BitBuffer {
856    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
857        let bool = BoolArray::execute(array, ctx)?;
858        assert!(
859            !bool.dtype().is_nullable(),
860            "bit buffer execute only works with non-nullable bool arrays"
861        );
862        Ok(bool.into_bit_buffer())
863    }
864}
865
866/// Execute the array to canonical form and unwrap as a [`NullArray`].
867///
868/// This will panic if the array's dtype is not null.
869impl Executable for NullArray {
870    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
871        match array.try_downcast::<Null>() {
872            Ok(null_array) => Ok(null_array),
873            Err(array) => Ok(Canonical::execute(array, ctx)?.into_null()),
874        }
875    }
876}
877
878/// Execute the array to canonical form and unwrap as a [`VarBinViewArray`].
879///
880/// This will panic if the array's dtype is not utf8 or binary.
881impl Executable for VarBinViewArray {
882    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
883        match array.try_downcast::<VarBinView>() {
884            Ok(varbinview) => Ok(varbinview),
885            Err(array) => Ok(Canonical::execute(array, ctx)?.into_varbinview()),
886        }
887    }
888}
889
890/// Execute the array to canonical form and unwrap as an [`ExtensionArray`].
891///
892/// This will panic if the array's dtype is not an extension type.
893impl Executable for ExtensionArray {
894    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
895        match array.try_downcast::<Extension>() {
896            Ok(ext_array) => Ok(ext_array),
897            Err(array) => Ok(Canonical::execute(array, ctx)?.into_extension()),
898        }
899    }
900}
901
902/// Execute the array to canonical form and unwrap as a [`DecimalArray`].
903///
904/// This will panic if the array's dtype is not decimal.
905impl Executable for DecimalArray {
906    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
907        match array.try_downcast::<Decimal>() {
908            Ok(decimal) => Ok(decimal),
909            Err(array) => Ok(Canonical::execute(array, ctx)?.into_decimal()),
910        }
911    }
912}
913
914/// Execute the array to canonical form and unwrap as a [`ListViewArray`].
915///
916/// This will panic if the array's dtype is not list.
917impl Executable for ListViewArray {
918    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
919        match array.try_downcast::<ListView>() {
920            Ok(list) => Ok(list),
921            Err(array) => Ok(Canonical::execute(array, ctx)?.into_listview()),
922        }
923    }
924}
925
926/// Execute the array to canonical form and unwrap as a [`FixedSizeListArray`].
927///
928/// This will panic if the array's dtype is not fixed size list.
929impl Executable for FixedSizeListArray {
930    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
931        match array.try_downcast::<FixedSizeList>() {
932            Ok(fsl) => Ok(fsl),
933            Err(array) => Ok(Canonical::execute(array, ctx)?.into_fixed_size_list()),
934        }
935    }
936}
937
938/// Execute the array to canonical form and unwrap as a [`StructArray`].
939///
940/// This will panic if the array's dtype is not struct.
941impl Executable for StructArray {
942    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
943        match array.try_downcast::<Struct>() {
944            Ok(struct_array) => Ok(struct_array),
945            Err(array) => Ok(Canonical::execute(array, ctx)?.into_struct()),
946        }
947    }
948}
949
950/// A view into a canonical array type.
951///
952/// Uses `ArrayView<V>` because these are obtained by
953/// downcasting through the `Matcher` trait which returns `ArrayView<V>`.
954#[derive(Debug, Clone, Copy)]
955pub enum CanonicalView<'a> {
956    Null(ArrayView<'a, Null>),
957    Bool(ArrayView<'a, Bool>),
958    Primitive(ArrayView<'a, Primitive>),
959    Decimal(ArrayView<'a, Decimal>),
960    VarBinView(ArrayView<'a, VarBinView>),
961    List(ArrayView<'a, ListView>),
962    FixedSizeList(ArrayView<'a, FixedSizeList>),
963    Struct(ArrayView<'a, Struct>),
964    Extension(ArrayView<'a, Extension>),
965    Variant(ArrayView<'a, Variant>),
966}
967
968impl From<CanonicalView<'_>> for Canonical {
969    fn from(value: CanonicalView<'_>) -> Self {
970        match value {
971            CanonicalView::Null(a) => Canonical::Null(a.into_owned()),
972            CanonicalView::Bool(a) => Canonical::Bool(a.into_owned()),
973            CanonicalView::Primitive(a) => Canonical::Primitive(a.into_owned()),
974            CanonicalView::Decimal(a) => Canonical::Decimal(a.into_owned()),
975            CanonicalView::VarBinView(a) => Canonical::VarBinView(a.into_owned()),
976            CanonicalView::List(a) => Canonical::List(a.into_owned()),
977            CanonicalView::FixedSizeList(a) => Canonical::FixedSizeList(a.into_owned()),
978            CanonicalView::Struct(a) => Canonical::Struct(a.into_owned()),
979            CanonicalView::Extension(a) => Canonical::Extension(a.into_owned()),
980            CanonicalView::Variant(a) => Canonical::Variant(a.into_owned()),
981        }
982    }
983}
984
985impl CanonicalView<'_> {
986    /// Convert to a type-erased [`ArrayRef`].
987    pub fn to_array_ref(&self) -> ArrayRef {
988        match self {
989            CanonicalView::Null(a) => a.array().clone(),
990            CanonicalView::Bool(a) => a.array().clone(),
991            CanonicalView::Primitive(a) => a.array().clone(),
992            CanonicalView::Decimal(a) => a.array().clone(),
993            CanonicalView::VarBinView(a) => a.array().clone(),
994            CanonicalView::List(a) => a.array().clone(),
995            CanonicalView::FixedSizeList(a) => a.array().clone(),
996            CanonicalView::Struct(a) => a.array().clone(),
997            CanonicalView::Extension(a) => a.array().clone(),
998            CanonicalView::Variant(a) => a.array().clone(),
999        }
1000    }
1001}
1002
1003/// A matcher for any canonical array type.
1004pub struct AnyCanonical;
1005impl Matcher for AnyCanonical {
1006    type Match<'a> = CanonicalView<'a>;
1007
1008    fn matches(array: &ArrayRef) -> bool {
1009        array.is::<Null>()
1010            || array.is::<Bool>()
1011            || array.is::<Primitive>()
1012            || array.is::<Decimal>()
1013            || array.is::<Struct>()
1014            || array.is::<ListView>()
1015            || array.is::<FixedSizeList>()
1016            || array.is::<VarBinView>()
1017            || array.is::<Variant>()
1018            || array.is::<Extension>()
1019            || array.is::<Variant>()
1020    }
1021
1022    fn try_match<'a>(array: &'a ArrayRef) -> Option<Self::Match<'a>> {
1023        if let Some(a) = array.as_opt::<Null>() {
1024            Some(CanonicalView::Null(a))
1025        } else if let Some(a) = array.as_opt::<Bool>() {
1026            Some(CanonicalView::Bool(a))
1027        } else if let Some(a) = array.as_opt::<Primitive>() {
1028            Some(CanonicalView::Primitive(a))
1029        } else if let Some(a) = array.as_opt::<Decimal>() {
1030            Some(CanonicalView::Decimal(a))
1031        } else if let Some(a) = array.as_opt::<Struct>() {
1032            Some(CanonicalView::Struct(a))
1033        } else if let Some(a) = array.as_opt::<ListView>() {
1034            Some(CanonicalView::List(a))
1035        } else if let Some(a) = array.as_opt::<FixedSizeList>() {
1036            Some(CanonicalView::FixedSizeList(a))
1037        } else if let Some(a) = array.as_opt::<VarBinView>() {
1038            Some(CanonicalView::VarBinView(a))
1039        } else if let Some(a) = array.as_opt::<Variant>() {
1040            Some(CanonicalView::Variant(a))
1041        } else {
1042            array.as_opt::<Extension>().map(CanonicalView::Extension)
1043        }
1044    }
1045}
1046
1047#[cfg(test)]
1048mod test {
1049    use std::sync::Arc;
1050
1051    use arrow_array::Array as ArrowArray;
1052    use arrow_array::ArrayRef as ArrowArrayRef;
1053    use arrow_array::ListArray as ArrowListArray;
1054    use arrow_array::PrimitiveArray as ArrowPrimitiveArray;
1055    use arrow_array::StringArray;
1056    use arrow_array::StringViewArray;
1057    use arrow_array::StructArray as ArrowStructArray;
1058    use arrow_array::cast::AsArray;
1059    use arrow_array::types::Int32Type;
1060    use arrow_array::types::Int64Type;
1061    use arrow_array::types::UInt64Type;
1062    use arrow_buffer::NullBufferBuilder;
1063    use arrow_buffer::OffsetBuffer;
1064    use arrow_schema::DataType;
1065    use arrow_schema::Field;
1066    use vortex_buffer::buffer;
1067
1068    use crate::ArrayRef;
1069    use crate::IntoArray;
1070    use crate::LEGACY_SESSION;
1071    use crate::VortexSessionExecute;
1072    use crate::arrays::ConstantArray;
1073    use crate::arrow::ArrowArrayExecutor;
1074    use crate::arrow::FromArrowArray;
1075    use crate::canonical::StructArray;
1076
1077    #[test]
1078    fn test_canonicalize_nested_struct() {
1079        let mut ctx = LEGACY_SESSION.create_execution_ctx();
1080        // Create a struct array with multiple internal components.
1081        let nested_struct_array = StructArray::from_fields(&[
1082            ("a", buffer![1u64].into_array()),
1083            (
1084                "b",
1085                StructArray::from_fields(&[(
1086                    "inner_a",
1087                    // The nested struct contains a ConstantArray representing the primitive array
1088                    //   [100i64]
1089                    // ConstantArray is not a canonical type, so converting `into_arrow()` should
1090                    // map this to the nearest canonical type (PrimitiveArray).
1091                    ConstantArray::new(100i64, 1).into_array(),
1092                )])
1093                .unwrap()
1094                .into_array(),
1095            ),
1096        ])
1097        .unwrap();
1098
1099        let arrow_struct = nested_struct_array
1100            .into_array()
1101            .execute_arrow(None, &mut ctx)
1102            .unwrap()
1103            .as_any()
1104            .downcast_ref::<ArrowStructArray>()
1105            .cloned()
1106            .unwrap();
1107
1108        assert!(
1109            arrow_struct
1110                .column(0)
1111                .as_any()
1112                .downcast_ref::<ArrowPrimitiveArray<UInt64Type>>()
1113                .is_some()
1114        );
1115
1116        let inner_struct = Arc::clone(arrow_struct.column(1))
1117            .as_any()
1118            .downcast_ref::<ArrowStructArray>()
1119            .cloned()
1120            .unwrap();
1121
1122        let inner_a = inner_struct
1123            .column(0)
1124            .as_any()
1125            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
1126        assert!(inner_a.is_some());
1127
1128        assert_eq!(
1129            inner_a.cloned().unwrap(),
1130            ArrowPrimitiveArray::from_iter([100i64])
1131        );
1132    }
1133
1134    #[test]
1135    fn roundtrip_struct() {
1136        let mut ctx = LEGACY_SESSION.create_execution_ctx();
1137        let mut nulls = NullBufferBuilder::new(6);
1138        nulls.append_n_non_nulls(4);
1139        nulls.append_null();
1140        nulls.append_non_null();
1141        let names = Arc::new(StringViewArray::from_iter(vec![
1142            Some("Joseph"),
1143            None,
1144            Some("Angela"),
1145            Some("Mikhail"),
1146            None,
1147            None,
1148        ]));
1149        let ages = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
1150            Some(25),
1151            Some(31),
1152            None,
1153            Some(57),
1154            None,
1155            None,
1156        ]));
1157
1158        let arrow_struct = ArrowStructArray::new(
1159            vec![
1160                Arc::new(Field::new("name", DataType::Utf8View, true)),
1161                Arc::new(Field::new("age", DataType::Int32, true)),
1162            ]
1163            .into(),
1164            vec![names, ages],
1165            nulls.finish(),
1166        );
1167
1168        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true).unwrap();
1169
1170        assert_eq!(
1171            &arrow_struct,
1172            vortex_struct
1173                .execute_arrow(None, &mut ctx)
1174                .unwrap()
1175                .as_struct()
1176        );
1177    }
1178
1179    #[test]
1180    fn roundtrip_list() {
1181        let mut ctx = LEGACY_SESSION.create_execution_ctx();
1182        let names = Arc::new(StringArray::from_iter(vec![
1183            Some("Joseph"),
1184            Some("Angela"),
1185            Some("Mikhail"),
1186        ]));
1187
1188        let arrow_list = ArrowListArray::new(
1189            Arc::new(Field::new_list_field(DataType::Utf8, true)),
1190            OffsetBuffer::from_lengths(vec![0, 2, 1]),
1191            names,
1192            None,
1193        );
1194        let list_data_type = arrow_list.data_type();
1195
1196        let vortex_list = ArrayRef::from_arrow(&arrow_list, true).unwrap();
1197
1198        let rt_arrow_list = vortex_list
1199            .execute_arrow(Some(list_data_type), &mut ctx)
1200            .unwrap();
1201
1202        assert_eq!(
1203            (Arc::new(arrow_list.clone()) as ArrowArrayRef).as_ref(),
1204            rt_arrow_list.as_ref()
1205        );
1206    }
1207}