Skip to main content

vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use std::sync::Arc;
7
8use vortex_buffer::BitBuffer;
9use vortex_buffer::Buffer;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_ensure;
13use vortex_error::vortex_panic;
14
15use crate::ArrayRef;
16use crate::ArraySlots;
17use crate::Executable;
18use crate::ExecutionCtx;
19use crate::IntoArray;
20use crate::array::ArrayView;
21use crate::array::child_to_validity;
22use crate::arrays::Bool;
23use crate::arrays::BoolArray;
24use crate::arrays::Decimal;
25use crate::arrays::DecimalArray;
26use crate::arrays::Extension;
27use crate::arrays::ExtensionArray;
28use crate::arrays::FixedSizeList;
29use crate::arrays::FixedSizeListArray;
30use crate::arrays::ListView;
31use crate::arrays::ListViewArray;
32use crate::arrays::Null;
33use crate::arrays::NullArray;
34use crate::arrays::Primitive;
35use crate::arrays::PrimitiveArray;
36use crate::arrays::Struct;
37use crate::arrays::StructArray;
38use crate::arrays::VarBinView;
39use crate::arrays::VarBinViewArray;
40use crate::arrays::Variant;
41use crate::arrays::VariantArray;
42use crate::arrays::bool::BoolDataParts;
43use crate::arrays::decimal::DecimalDataParts;
44use crate::arrays::extension::ExtensionArrayExt;
45use crate::arrays::fixed_size_list::FixedSizeListArrayExt;
46use crate::arrays::listview::ListViewDataParts;
47use crate::arrays::listview::ListViewRebuildMode;
48use crate::arrays::primitive::PrimitiveDataParts;
49use crate::arrays::struct_::StructDataParts;
50use crate::arrays::varbinview::VarBinViewDataParts;
51use crate::arrays::variant::VariantArrayExt;
52use crate::dtype::DType;
53use crate::dtype::NativePType;
54use crate::dtype::Nullability;
55use crate::dtype::PType;
56use crate::match_each_decimal_value_type;
57use crate::match_each_native_ptype;
58use crate::matcher::Matcher;
59use crate::validity::Validity;
60
61/// An enum capturing the default uncompressed encodings for each [Vortex type](DType).
62///
63/// Any array can be decoded into canonical form via the `to_canonical`
64/// trait method. This is the simplest encoding for a type, and will not be compressed but may
65/// contain compressed child arrays.
66///
67/// Canonical form is useful for doing type-specific compute where you need to know that all
68/// elements are laid out decompressed and contiguous in memory.
69///
70/// Each `Canonical` variant has a corresponding [`DType`] variant, with the notable exception of
71/// [`Canonical::VarBinView`], which is the canonical encoding for both [`DType::Utf8`] and
72/// [`DType::Binary`].
73///
74/// # Laziness
75///
76/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
77/// `Struct` type, individual column child arrays may still be compressed. This allows
78/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
79/// never need to be decoded into canonical form at all depending on the compute.
80///
81/// # Arrow interoperability
82///
83/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
84/// zero-copy, and the corresponding Arrow array types can also be built directly.
85///
86/// The full list of canonical types and their equivalent Arrow array types are:
87///
88/// * `NullArray`: [`arrow_array::NullArray`]
89/// * `BoolArray`: [`arrow_array::BooleanArray`]
90/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
91/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
92/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
93/// * `ListViewArray`: [`arrow_array::ListViewArray`]
94/// * `FixedSizeListArray`: [`arrow_array::FixedSizeListArray`]
95/// * `StructArray`: [`arrow_array::StructArray`]
96///
97/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
98/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
99/// create ambiguity.
100/// Thus, if you receive an Arrow array, compress it using Vortex, and then
101/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
102/// variants to hold the data.
103///
104/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
105/// will correspond to an arrow-rs [`arrow_schema::DataType`].
106///
107/// # Views support
108///
109/// Binary and String views, also known as "German strings" are a better encoding format for
110/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
111/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
112/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
113/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
114/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
115/// from memory.
116///
117/// # For Developers
118///
119/// If you add another variant to this enum, make sure to update `dyn Array::is_canonical`,
120/// and the fuzzer in `fuzz/fuzz_targets/array_ops.rs`.
#[derive(Debug, Clone)]
pub enum Canonical {
    /// Canonical array for [`DType::Null`].
    Null(NullArray),
    /// Canonical array for [`DType::Bool`].
    Bool(BoolArray),
    /// Canonical array for [`DType::Primitive`].
    Primitive(PrimitiveArray),
    /// Canonical array for [`DType::Decimal`].
    Decimal(DecimalArray),
    /// Canonical array for both [`DType::Utf8`] and [`DType::Binary`].
    VarBinView(VarBinViewArray),
    /// Canonical array for [`DType::List`]; note that it holds a [`ListViewArray`].
    List(ListViewArray),
    /// Canonical array for [`DType::FixedSizeList`].
    FixedSizeList(FixedSizeListArray),
    /// Canonical array for [`DType::Struct`].
    Struct(StructArray),
    /// Canonical array for [`DType::Extension`], wrapping a storage array.
    Extension(ExtensionArray),
    /// Holds a [`VariantArray`]. `Canonical::empty` does not support [`DType::Variant`].
    Variant(VariantArray),
}
134
/// Dispatches over every [`Canonical`] variant, binding the wrapped array to the given
/// identifier and evaluating the same expression in each arm.
///
/// The expression is expanded once per variant, so it must type-check for every canonical
/// array type.
macro_rules! match_each_canonical {
    ($canonical:expr, | $binding:ident | $body:expr) => {{
        match $canonical {
            Canonical::Null($binding) => $body,
            Canonical::Bool($binding) => $body,
            Canonical::Primitive($binding) => $body,
            Canonical::Decimal($binding) => $body,
            Canonical::VarBinView($binding) => $body,
            Canonical::List($binding) => $body,
            Canonical::FixedSizeList($binding) => $body,
            Canonical::Struct($binding) => $body,
            Canonical::Extension($binding) => $body,
            Canonical::Variant($binding) => $body,
        }
    }};
}
152
153impl Canonical {
154    /// Create an empty canonical array of the given dtype.
155    pub fn empty(dtype: &DType) -> Canonical {
156        match dtype {
157            DType::Null => Canonical::Null(NullArray::new(0)),
158            DType::Bool(n) => Canonical::Bool(unsafe {
159                BoolArray::new_unchecked(BitBuffer::empty(), Validity::from(n))
160            }),
161            DType::Primitive(ptype, n) => {
162                match_each_native_ptype!(ptype, |P| {
163                    Canonical::Primitive(unsafe {
164                        PrimitiveArray::new_unchecked(Buffer::<P>::empty(), Validity::from(n))
165                    })
166                })
167            }
168            DType::Decimal(decimal_type, n) => {
169                match_each_decimal_value_type!(
170                    DecimalType::smallest_decimal_value_type(decimal_type),
171                    |D| {
172                        Canonical::Decimal(unsafe {
173                            DecimalArray::new_unchecked::<D>(
174                                Buffer::empty(),
175                                *decimal_type,
176                                Validity::from(n),
177                            )
178                        })
179                    }
180                )
181            }
182            DType::Utf8(n) => Canonical::VarBinView(unsafe {
183                VarBinViewArray::new_unchecked(
184                    Buffer::empty(),
185                    Arc::new([]),
186                    dtype.clone(),
187                    Validity::from(n),
188                )
189            }),
190            DType::Binary(n) => Canonical::VarBinView(unsafe {
191                VarBinViewArray::new_unchecked(
192                    Buffer::empty(),
193                    Arc::new([]),
194                    dtype.clone(),
195                    Validity::from(n),
196                )
197            }),
198            DType::List(dtype, n) => Canonical::List(unsafe {
199                ListViewArray::new_unchecked(
200                    Canonical::empty(dtype).into_array(),
201                    Canonical::empty(&DType::Primitive(PType::U8, Nullability::NonNullable))
202                        .into_array(),
203                    Canonical::empty(&DType::Primitive(PType::U8, Nullability::NonNullable))
204                        .into_array(),
205                    Validity::from(n),
206                )
207                // An empty list view is trivially copyable to a list.
208                .with_zero_copy_to_list(true)
209            }),
210            DType::FixedSizeList(elem_dtype, list_size, null) => Canonical::FixedSizeList(unsafe {
211                FixedSizeListArray::new_unchecked(
212                    Canonical::empty(elem_dtype).into_array(),
213                    *list_size,
214                    Validity::from(null),
215                    0,
216                )
217            }),
218            DType::Struct(struct_dtype, n) => Canonical::Struct(unsafe {
219                StructArray::new_unchecked(
220                    struct_dtype
221                        .fields()
222                        .map(|f| Canonical::empty(&f).into_array())
223                        .collect::<Arc<[_]>>(),
224                    struct_dtype.clone(),
225                    0,
226                    Validity::from(n),
227                )
228            }),
229            DType::Union(..) => todo!("TODO(connor)[Union]: unimplemented"),
230            DType::Variant(_) => {
231                vortex_panic!(InvalidArgument: "Canonical empty is not supported for Variant")
232            }
233            DType::Extension(ext_dtype) => Canonical::Extension(ExtensionArray::new(
234                ext_dtype.clone(),
235                Canonical::empty(ext_dtype.storage_dtype()).into_array(),
236            )),
237        }
238    }
239
240    pub fn len(&self) -> usize {
241        match_each_canonical!(self, |arr| arr.len())
242    }
243
244    pub fn dtype(&self) -> &DType {
245        match_each_canonical!(self, |arr| arr.dtype())
246    }
247
248    pub fn is_empty(&self) -> bool {
249        match_each_canonical!(self, |arr| arr.is_empty())
250    }
251}
252
253impl Canonical {
254    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
255    ///
256    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
257    /// they can accumulate wasted space after slicing and taking operations.
258    ///
259    /// This operation is very expensive and can result in things like allocations, full-scans
260    /// and copy operations.
261    pub fn compact(&self) -> VortexResult<Canonical> {
262        match self {
263            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
264            Canonical::List(array) => Ok(Canonical::List(
265                array.rebuild(ListViewRebuildMode::TrimElements)?,
266            )),
267            _ => Ok(self.clone()),
268        }
269    }
270}
271
272// Unwrap canonical type back down to specialized type.
273impl Canonical {
274    pub fn as_null(&self) -> &NullArray {
275        if let Canonical::Null(a) = self {
276            a
277        } else {
278            vortex_panic!("Cannot get NullArray from {:?}", &self)
279        }
280    }
281
282    pub fn into_null(self) -> NullArray {
283        if let Canonical::Null(a) = self {
284            a
285        } else {
286            vortex_panic!("Cannot unwrap NullArray from {:?}", &self)
287        }
288    }
289
290    pub fn as_bool(&self) -> &BoolArray {
291        if let Canonical::Bool(a) = self {
292            a
293        } else {
294            vortex_panic!("Cannot get BoolArray from {:?}", &self)
295        }
296    }
297
298    pub fn into_bool(self) -> BoolArray {
299        if let Canonical::Bool(a) = self {
300            a
301        } else {
302            vortex_panic!("Cannot unwrap BoolArray from {:?}", &self)
303        }
304    }
305
306    pub fn as_primitive(&self) -> &PrimitiveArray {
307        if let Canonical::Primitive(a) = self {
308            a
309        } else {
310            vortex_panic!("Cannot get PrimitiveArray from {:?}", &self)
311        }
312    }
313
314    pub fn into_primitive(self) -> PrimitiveArray {
315        if let Canonical::Primitive(a) = self {
316            a
317        } else {
318            vortex_panic!("Cannot unwrap PrimitiveArray from {:?}", &self)
319        }
320    }
321
322    pub fn as_decimal(&self) -> &DecimalArray {
323        if let Canonical::Decimal(a) = self {
324            a
325        } else {
326            vortex_panic!("Cannot get DecimalArray from {:?}", &self)
327        }
328    }
329
330    pub fn into_decimal(self) -> DecimalArray {
331        if let Canonical::Decimal(a) = self {
332            a
333        } else {
334            vortex_panic!("Cannot unwrap DecimalArray from {:?}", &self)
335        }
336    }
337
338    pub fn as_varbinview(&self) -> &VarBinViewArray {
339        if let Canonical::VarBinView(a) = self {
340            a
341        } else {
342            vortex_panic!("Cannot get VarBinViewArray from {:?}", &self)
343        }
344    }
345
346    pub fn into_varbinview(self) -> VarBinViewArray {
347        if let Canonical::VarBinView(a) = self {
348            a
349        } else {
350            vortex_panic!("Cannot unwrap VarBinViewArray from {:?}", &self)
351        }
352    }
353
354    pub fn as_listview(&self) -> &ListViewArray {
355        if let Canonical::List(a) = self {
356            a
357        } else {
358            vortex_panic!("Cannot get ListArray from {:?}", &self)
359        }
360    }
361
362    pub fn into_listview(self) -> ListViewArray {
363        if let Canonical::List(a) = self {
364            a
365        } else {
366            vortex_panic!("Cannot unwrap ListArray from {:?}", &self)
367        }
368    }
369
370    pub fn as_fixed_size_list(&self) -> &FixedSizeListArray {
371        if let Canonical::FixedSizeList(a) = self {
372            a
373        } else {
374            vortex_panic!("Cannot get FixedSizeListArray from {:?}", &self)
375        }
376    }
377
378    pub fn into_fixed_size_list(self) -> FixedSizeListArray {
379        if let Canonical::FixedSizeList(a) = self {
380            a
381        } else {
382            vortex_panic!("Cannot unwrap FixedSizeListArray from {:?}", &self)
383        }
384    }
385
386    pub fn as_struct(&self) -> &StructArray {
387        if let Canonical::Struct(a) = self {
388            a
389        } else {
390            vortex_panic!("Cannot get StructArray from {:?}", &self)
391        }
392    }
393
394    pub fn into_struct(self) -> StructArray {
395        if let Canonical::Struct(a) = self {
396            a
397        } else {
398            vortex_panic!("Cannot unwrap StructArray from {:?}", &self)
399        }
400    }
401
402    pub fn as_extension(&self) -> &ExtensionArray {
403        if let Canonical::Extension(a) = self {
404            a
405        } else {
406            vortex_panic!("Cannot get ExtensionArray from {:?}", &self)
407        }
408    }
409
410    pub fn into_extension(self) -> ExtensionArray {
411        if let Canonical::Extension(a) = self {
412            a
413        } else {
414            vortex_panic!("Cannot unwrap ExtensionArray from {:?}", &self)
415        }
416    }
417}
418
419impl IntoArray for Canonical {
420    fn into_array(self) -> ArrayRef {
421        match_each_canonical!(self, |arr| arr.into_array())
422    }
423}
424
/// Convenience conversions from a borrowed array into an owned canonical array of a specific
/// type.
///
/// # Canonicalization
///
/// This trait is implemented for [`ArrayRef`]. That implementation canonicalizes the array
/// and panics if canonicalization fails or yields a different canonical variant than the one
/// requested.
#[deprecated(note = "use `array.execute::<T>(ctx)` instead")]
pub trait ToCanonical {
    /// Canonicalize into a [`NullArray`] if the target is [`Null`](DType::Null) typed.
    #[deprecated(note = "use `array.execute::<NullArray>(ctx)` instead")]
    fn to_null(&self) -> NullArray;

    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`](DType::Bool) typed.
    #[deprecated(note = "use `array.execute::<BoolArray>(ctx)` instead")]
    fn to_bool(&self) -> BoolArray;

    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`](DType::Primitive)
    /// typed.
    #[deprecated(note = "use `array.execute::<PrimitiveArray>(ctx)` instead")]
    fn to_primitive(&self) -> PrimitiveArray;

    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`](DType::Decimal)
    /// typed.
    #[deprecated(note = "use `array.execute::<DecimalArray>(ctx)` instead")]
    fn to_decimal(&self) -> DecimalArray;

    /// Canonicalize into a [`StructArray`] if the target is [`Struct`](DType::Struct) typed.
    #[deprecated(note = "use `array.execute::<StructArray>(ctx)` instead")]
    fn to_struct(&self) -> StructArray;

    /// Canonicalize into a [`ListViewArray`] if the target is [`List`](DType::List) typed.
    #[deprecated(note = "use `array.execute::<ListViewArray>(ctx)` instead")]
    fn to_listview(&self) -> ListViewArray;

    /// Canonicalize into a [`FixedSizeListArray`] if the target is
    /// [`FixedSizeList`](DType::FixedSizeList) typed.
    #[deprecated(note = "use `array.execute::<FixedSizeListArray>(ctx)` instead")]
    fn to_fixed_size_list(&self) -> FixedSizeListArray;

    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`](DType::Utf8)
    /// or [`Binary`](DType::Binary) typed.
    #[deprecated(note = "use `array.execute::<VarBinViewArray>(ctx)` instead")]
    fn to_varbinview(&self) -> VarBinViewArray;

    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`](DType::Extension)
    /// typed.
    #[deprecated(note = "use `array.execute::<ExtensionArray>(ctx)` instead")]
    fn to_extension(&self) -> ExtensionArray;
}
473
474// Blanket impl for all Array encodings.
475#[expect(deprecated)]
476impl ToCanonical for ArrayRef {
477    fn to_null(&self) -> NullArray {
478        #[expect(deprecated)]
479        let result = self.to_canonical().vortex_expect("to_canonical failed");
480        result.into_null()
481    }
482
483    fn to_bool(&self) -> BoolArray {
484        #[expect(deprecated)]
485        let result = self.to_canonical().vortex_expect("to_canonical failed");
486        result.into_bool()
487    }
488
489    fn to_primitive(&self) -> PrimitiveArray {
490        #[expect(deprecated)]
491        let result = self.to_canonical().vortex_expect("to_canonical failed");
492        result.into_primitive()
493    }
494
495    fn to_decimal(&self) -> DecimalArray {
496        #[expect(deprecated)]
497        let result = self.to_canonical().vortex_expect("to_canonical failed");
498        result.into_decimal()
499    }
500
501    fn to_struct(&self) -> StructArray {
502        #[expect(deprecated)]
503        let result = self.to_canonical().vortex_expect("to_canonical failed");
504        result.into_struct()
505    }
506
507    fn to_listview(&self) -> ListViewArray {
508        #[expect(deprecated)]
509        let result = self.to_canonical().vortex_expect("to_canonical failed");
510        result.into_listview()
511    }
512
513    fn to_fixed_size_list(&self) -> FixedSizeListArray {
514        #[expect(deprecated)]
515        let result = self.to_canonical().vortex_expect("to_canonical failed");
516        result.into_fixed_size_list()
517    }
518
519    fn to_varbinview(&self) -> VarBinViewArray {
520        #[expect(deprecated)]
521        let result = self.to_canonical().vortex_expect("to_canonical failed");
522        result.into_varbinview()
523    }
524
525    fn to_extension(&self) -> ExtensionArray {
526        #[expect(deprecated)]
527        let result = self.to_canonical().vortex_expect("to_canonical failed");
528        result.into_extension()
529    }
530}
531
532impl From<Canonical> for ArrayRef {
533    fn from(value: Canonical) -> Self {
534        match_each_canonical!(value, |arr| arr.into_array())
535    }
536}
537
538/// Execute into [`Canonical`] by running `execute_until` with the [`AnyCanonical`] matcher.
539///
540/// Unlike executing into [`crate::Columnar`], this will fully expand constant arrays into their
541/// canonical form. Callers should prefer to execute into `Columnar` if they are able to optimize
542/// their use for constant arrays.
543impl Executable for Canonical {
544    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
545        let result = array.execute_until::<AnyCanonical>(ctx)?;
546        Ok(result
547            .as_opt::<AnyCanonical>()
548            .map(Canonical::from)
549            .vortex_expect("execute_until::<AnyCanonical> must return a canonical array"))
550    }
551}
552
553/// Recursively execute the array until it reaches canonical form along with its validity.
554///
555/// Callers should prefer to execute into `Columnar` instead of this specific target.
556/// This target is useful when preparing arrays for writing.
557pub struct CanonicalValidity(pub Canonical);
558
559impl Executable for CanonicalValidity {
560    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
561        match array.execute::<Canonical>(ctx)? {
562            n @ Canonical::Null(_) => Ok(CanonicalValidity(n)),
563            Canonical::Bool(b) => {
564                let validity = child_to_validity(b.slots()[0].as_ref(), b.dtype().nullability());
565                let len = b.len();
566                let BoolDataParts { bits, offset, len } = b.into_data().into_parts(len);
567                Ok(CanonicalValidity(Canonical::Bool(
568                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
569                )))
570            }
571            Canonical::Primitive(p) => {
572                let PrimitiveDataParts {
573                    ptype,
574                    buffer,
575                    validity,
576                } = p.into_data_parts();
577                Ok(CanonicalValidity(Canonical::Primitive(unsafe {
578                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
579                })))
580            }
581            Canonical::Decimal(d) => {
582                let DecimalDataParts {
583                    decimal_dtype,
584                    values,
585                    values_type,
586                    validity,
587                } = d.into_data_parts();
588                Ok(CanonicalValidity(Canonical::Decimal(unsafe {
589                    DecimalArray::new_unchecked_handle(
590                        values,
591                        values_type,
592                        decimal_dtype,
593                        validity.execute(ctx)?,
594                    )
595                })))
596            }
597            Canonical::VarBinView(vbv) => {
598                let VarBinViewDataParts {
599                    dtype,
600                    buffers,
601                    views,
602                    validity,
603                } = vbv.into_data_parts();
604                Ok(CanonicalValidity(Canonical::VarBinView(unsafe {
605                    VarBinViewArray::new_handle_unchecked(
606                        views,
607                        buffers,
608                        dtype,
609                        validity.execute(ctx)?,
610                    )
611                })))
612            }
613            Canonical::List(l) => {
614                let zctl = l.is_zero_copy_to_list();
615                let ListViewDataParts {
616                    elements,
617                    offsets,
618                    sizes,
619                    validity,
620                    ..
621                } = l.into_data_parts();
622                Ok(CanonicalValidity(Canonical::List(unsafe {
623                    ListViewArray::new_unchecked(elements, offsets, sizes, validity.execute(ctx)?)
624                        .with_zero_copy_to_list(zctl)
625                })))
626            }
627            Canonical::FixedSizeList(fsl) => {
628                let list_size = fsl.list_size();
629                let len = fsl.len();
630                let parts = fsl.into_data_parts();
631                let elements = parts.elements;
632                let validity = parts.validity;
633                Ok(CanonicalValidity(Canonical::FixedSizeList(
634                    FixedSizeListArray::new(elements, list_size, validity.execute(ctx)?, len),
635                )))
636            }
637            Canonical::Struct(st) => {
638                let len = st.len();
639                let StructDataParts {
640                    struct_fields,
641                    fields,
642                    validity,
643                } = st.into_data_parts();
644                Ok(CanonicalValidity(Canonical::Struct(unsafe {
645                    StructArray::new_unchecked(fields, struct_fields, len, validity.execute(ctx)?)
646                })))
647            }
648            Canonical::Extension(ext) => Ok(CanonicalValidity(Canonical::Extension(
649                ExtensionArray::new(
650                    ext.ext_dtype().clone(),
651                    ext.storage_array()
652                        .clone()
653                        .execute::<CanonicalValidity>(ctx)?
654                        .0
655                        .into_array(),
656                ),
657            ))),
658            Canonical::Variant(variant) => {
659                let core_storage = recursively_canonicalize_slots(variant.core_storage(), ctx)?;
660                let shredded = variant
661                    .shredded()
662                    .map(|shredded| {
663                        shredded
664                            .clone()
665                            .execute::<CanonicalValidity>(ctx)
666                            .map(|canonical| canonical.0.into_array())
667                    })
668                    .transpose()?;
669                Ok(CanonicalValidity(Canonical::Variant(
670                    VariantArray::try_new(core_storage, shredded)?,
671                )))
672            }
673        }
674    }
675}
676
/// Recursively execute the array until all of its children are canonical.
///
/// This target is useful to guarantee that all operators are fully executed. Callers should
/// prefer an execution target that's suitable for their use case instead of this one.
pub struct RecursiveCanonical(pub Canonical);
682
683// TODO: Currently only used for Variant, in the future
684// can probably be used for more canonical types like Struct.
685fn recursively_canonicalize_slots(
686    array: &ArrayRef,
687    ctx: &mut ExecutionCtx,
688) -> VortexResult<ArrayRef> {
689    let slots = array
690        .slots()
691        .iter()
692        .map(|slot| {
693            slot.as_ref()
694                .map(|child| {
695                    child
696                        .clone()
697                        .execute::<RecursiveCanonical>(ctx)
698                        .map(|canonical| canonical.0.into_array())
699                })
700                .transpose()
701        })
702        .collect::<VortexResult<ArraySlots>>()?;
703    array.clone().with_slots(slots)
704}
705
706impl Executable for RecursiveCanonical {
707    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
708        match array.execute::<Canonical>(ctx)? {
709            n @ Canonical::Null(_) => Ok(RecursiveCanonical(n)),
710            Canonical::Bool(b) => {
711                let validity = child_to_validity(b.slots()[0].as_ref(), b.dtype().nullability());
712                let len = b.len();
713                let BoolDataParts { bits, offset, len } = b.into_data().into_parts(len);
714                Ok(RecursiveCanonical(Canonical::Bool(
715                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
716                )))
717            }
718            Canonical::Primitive(p) => {
719                let PrimitiveDataParts {
720                    ptype,
721                    buffer,
722                    validity,
723                } = p.into_data_parts();
724                Ok(RecursiveCanonical(Canonical::Primitive(unsafe {
725                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
726                })))
727            }
728            Canonical::Decimal(d) => {
729                let DecimalDataParts {
730                    decimal_dtype,
731                    values,
732                    values_type,
733                    validity,
734                } = d.into_data_parts();
735                Ok(RecursiveCanonical(Canonical::Decimal(unsafe {
736                    DecimalArray::new_unchecked_handle(
737                        values,
738                        values_type,
739                        decimal_dtype,
740                        validity.execute(ctx)?,
741                    )
742                })))
743            }
744            Canonical::VarBinView(vbv) => {
745                let VarBinViewDataParts {
746                    dtype,
747                    buffers,
748                    views,
749                    validity,
750                } = vbv.into_data_parts();
751                Ok(RecursiveCanonical(Canonical::VarBinView(unsafe {
752                    VarBinViewArray::new_handle_unchecked(
753                        views,
754                        buffers,
755                        dtype,
756                        validity.execute(ctx)?,
757                    )
758                })))
759            }
760            Canonical::List(l) => {
761                let zctl = l.is_zero_copy_to_list();
762                let ListViewDataParts {
763                    elements,
764                    offsets,
765                    sizes,
766                    validity,
767                    ..
768                } = l.into_data_parts();
769                Ok(RecursiveCanonical(Canonical::List(unsafe {
770                    ListViewArray::new_unchecked(
771                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
772                        offsets.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
773                        sizes.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
774                        validity.execute(ctx)?,
775                    )
776                    .with_zero_copy_to_list(zctl)
777                })))
778            }
779            Canonical::FixedSizeList(fsl) => {
780                let list_size = fsl.list_size();
781                let len = fsl.len();
782                let parts = fsl.into_data_parts();
783                let elements = parts.elements;
784                let validity = parts.validity;
785                Ok(RecursiveCanonical(Canonical::FixedSizeList(
786                    FixedSizeListArray::new(
787                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
788                        list_size,
789                        validity.execute(ctx)?,
790                        len,
791                    ),
792                )))
793            }
794            Canonical::Struct(st) => {
795                let len = st.len();
796                let StructDataParts {
797                    struct_fields,
798                    fields,
799                    validity,
800                } = st.into_data_parts();
801                let executed_fields = fields
802                    .iter()
803                    .map(|f| Ok(f.clone().execute::<RecursiveCanonical>(ctx)?.0.into_array()))
804                    .collect::<VortexResult<Arc<[_]>>>()?;
805
806                Ok(RecursiveCanonical(Canonical::Struct(unsafe {
807                    StructArray::new_unchecked(
808                        executed_fields,
809                        struct_fields,
810                        len,
811                        validity.execute(ctx)?,
812                    )
813                })))
814            }
815            Canonical::Extension(ext) => Ok(RecursiveCanonical(Canonical::Extension(
816                ExtensionArray::new(
817                    ext.ext_dtype().clone(),
818                    ext.storage_array()
819                        .clone()
820                        .execute::<RecursiveCanonical>(ctx)?
821                        .0
822                        .into_array(),
823                ),
824            ))),
825            Canonical::Variant(variant) => {
826                let core_storage = recursively_canonicalize_slots(variant.core_storage(), ctx)?;
827                let shredded = variant
828                    .shredded()
829                    .map(|shredded| {
830                        shredded
831                            .clone()
832                            .execute::<RecursiveCanonical>(ctx)
833                            .map(|canonical| canonical.0.into_array())
834                    })
835                    .transpose()?;
836                Ok(RecursiveCanonical(Canonical::Variant(
837                    VariantArray::try_new(core_storage, shredded)?,
838                )))
839            }
840        }
841    }
842}
843
844/// Execute a primitive typed array into a buffer of native values, assuming all values are valid.
845///
846/// # Errors
847///
848/// Returns a `VortexError` if the array is not all-valid (has any nulls).
849impl<T: NativePType> Executable for Buffer<T> {
850    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
851        let array = PrimitiveArray::execute(array, ctx)?;
852        vortex_ensure!(
853            matches!(
854                array.validity()?,
855                Validity::NonNullable | Validity::AllValid
856            ),
857            "Cannot execute to native buffer: array is not all-valid."
858        );
859        Ok(array.into_buffer())
860    }
861}
862
863/// Execute the array to canonical form and unwrap as a [`PrimitiveArray`].
864///
865/// This will panic if the array's dtype is not primitive.
866impl Executable for PrimitiveArray {
867    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
868        match array.try_downcast::<Primitive>() {
869            Ok(primitive) => Ok(primitive),
870            Err(array) => Ok(Canonical::execute(array, ctx)?.into_primitive()),
871        }
872    }
873}
874
875/// Execute the array to canonical form and unwrap as a [`BoolArray`].
876///
877/// This will panic if the array's dtype is not bool.
878impl Executable for BoolArray {
879    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
880        match array.try_downcast::<Bool>() {
881            Ok(bool_array) => Ok(bool_array),
882            Err(array) => Ok(Canonical::execute(array, ctx)?.into_bool()),
883        }
884    }
885}
886
887/// Execute the array to a [`BitBuffer`], aka a non-nullable  [`BoolArray`].
888///
889/// This will panic if the array's dtype is not non-nullable bool.
890impl Executable for BitBuffer {
891    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
892        let bool = BoolArray::execute(array, ctx)?;
893        assert!(
894            !bool.dtype().is_nullable(),
895            "bit buffer execute only works with non-nullable bool arrays"
896        );
897        Ok(bool.into_bit_buffer())
898    }
899}
900
901/// Execute the array to canonical form and unwrap as a [`NullArray`].
902///
903/// This will panic if the array's dtype is not null.
904impl Executable for NullArray {
905    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
906        match array.try_downcast::<Null>() {
907            Ok(null_array) => Ok(null_array),
908            Err(array) => Ok(Canonical::execute(array, ctx)?.into_null()),
909        }
910    }
911}
912
913/// Execute the array to canonical form and unwrap as a [`VarBinViewArray`].
914///
915/// This will panic if the array's dtype is not utf8 or binary.
916impl Executable for VarBinViewArray {
917    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
918        match array.try_downcast::<VarBinView>() {
919            Ok(varbinview) => Ok(varbinview),
920            Err(array) => Ok(Canonical::execute(array, ctx)?.into_varbinview()),
921        }
922    }
923}
924
925/// Execute the array to canonical form and unwrap as an [`ExtensionArray`].
926///
927/// This will panic if the array's dtype is not an extension type.
928impl Executable for ExtensionArray {
929    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
930        match array.try_downcast::<Extension>() {
931            Ok(ext_array) => Ok(ext_array),
932            Err(array) => Ok(Canonical::execute(array, ctx)?.into_extension()),
933        }
934    }
935}
936
937/// Execute the array to canonical form and unwrap as a [`DecimalArray`].
938///
939/// This will panic if the array's dtype is not decimal.
940impl Executable for DecimalArray {
941    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
942        match array.try_downcast::<Decimal>() {
943            Ok(decimal) => Ok(decimal),
944            Err(array) => Ok(Canonical::execute(array, ctx)?.into_decimal()),
945        }
946    }
947}
948
949/// Execute the array to canonical form and unwrap as a [`ListViewArray`].
950///
951/// This will panic if the array's dtype is not list.
952impl Executable for ListViewArray {
953    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
954        match array.try_downcast::<ListView>() {
955            Ok(list) => Ok(list),
956            Err(array) => Ok(Canonical::execute(array, ctx)?.into_listview()),
957        }
958    }
959}
960
961/// Execute the array to canonical form and unwrap as a [`FixedSizeListArray`].
962///
963/// This will panic if the array's dtype is not fixed size list.
964impl Executable for FixedSizeListArray {
965    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
966        match array.try_downcast::<FixedSizeList>() {
967            Ok(fsl) => Ok(fsl),
968            Err(array) => Ok(Canonical::execute(array, ctx)?.into_fixed_size_list()),
969        }
970    }
971}
972
973/// Execute the array to canonical form and unwrap as a [`StructArray`].
974///
975/// This will panic if the array's dtype is not struct.
976impl Executable for StructArray {
977    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
978        match array.try_downcast::<Struct>() {
979            Ok(struct_array) => Ok(struct_array),
980            Err(array) => Ok(Canonical::execute(array, ctx)?.into_struct()),
981        }
982    }
983}
984
985/// Execute the array to canonical form and unwrap as a [`VariantArray`].
986///
987/// This will panic if the array's dtype is not variant.
988impl Executable for VariantArray {
989    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
990        match array.try_downcast::<Variant>() {
991            Ok(variant_array) => Ok(variant_array),
992            Err(array) => match Canonical::execute(array, ctx)? {
993                Canonical::Variant(variant_array) => Ok(variant_array),
994                canonical => vortex_panic!("Cannot unwrap VariantArray from {:?}", canonical),
995            },
996        }
997    }
998}
999
1000/// A view into a canonical array type.
1001///
1002/// Uses `ArrayView<V>` because these are obtained by
1003/// downcasting through the `Matcher` trait which returns `ArrayView<V>`.
1004#[derive(Debug, Clone, Copy)]
1005pub enum CanonicalView<'a> {
1006    Null(ArrayView<'a, Null>),
1007    Bool(ArrayView<'a, Bool>),
1008    Primitive(ArrayView<'a, Primitive>),
1009    Decimal(ArrayView<'a, Decimal>),
1010    VarBinView(ArrayView<'a, VarBinView>),
1011    List(ArrayView<'a, ListView>),
1012    FixedSizeList(ArrayView<'a, FixedSizeList>),
1013    Struct(ArrayView<'a, Struct>),
1014    Extension(ArrayView<'a, Extension>),
1015    Variant(ArrayView<'a, Variant>),
1016}
1017
1018impl From<CanonicalView<'_>> for Canonical {
1019    fn from(value: CanonicalView<'_>) -> Self {
1020        match value {
1021            CanonicalView::Null(a) => Canonical::Null(a.into_owned()),
1022            CanonicalView::Bool(a) => Canonical::Bool(a.into_owned()),
1023            CanonicalView::Primitive(a) => Canonical::Primitive(a.into_owned()),
1024            CanonicalView::Decimal(a) => Canonical::Decimal(a.into_owned()),
1025            CanonicalView::VarBinView(a) => Canonical::VarBinView(a.into_owned()),
1026            CanonicalView::List(a) => Canonical::List(a.into_owned()),
1027            CanonicalView::FixedSizeList(a) => Canonical::FixedSizeList(a.into_owned()),
1028            CanonicalView::Struct(a) => Canonical::Struct(a.into_owned()),
1029            CanonicalView::Extension(a) => Canonical::Extension(a.into_owned()),
1030            CanonicalView::Variant(a) => Canonical::Variant(a.into_owned()),
1031        }
1032    }
1033}
1034
1035impl CanonicalView<'_> {
1036    /// Convert to a type-erased [`ArrayRef`].
1037    pub fn to_array_ref(&self) -> ArrayRef {
1038        match self {
1039            CanonicalView::Null(a) => a.array().clone(),
1040            CanonicalView::Bool(a) => a.array().clone(),
1041            CanonicalView::Primitive(a) => a.array().clone(),
1042            CanonicalView::Decimal(a) => a.array().clone(),
1043            CanonicalView::VarBinView(a) => a.array().clone(),
1044            CanonicalView::List(a) => a.array().clone(),
1045            CanonicalView::FixedSizeList(a) => a.array().clone(),
1046            CanonicalView::Struct(a) => a.array().clone(),
1047            CanonicalView::Extension(a) => a.array().clone(),
1048            CanonicalView::Variant(a) => a.array().clone(),
1049        }
1050    }
1051}
1052
1053/// A matcher for any canonical array type.
1054pub struct AnyCanonical;
1055impl Matcher for AnyCanonical {
1056    type Match<'a> = CanonicalView<'a>;
1057
1058    fn matches(array: &ArrayRef) -> bool {
1059        array.is::<Null>()
1060            || array.is::<Bool>()
1061            || array.is::<Primitive>()
1062            || array.is::<Decimal>()
1063            || array.is::<Struct>()
1064            || array.is::<ListView>()
1065            || array.is::<FixedSizeList>()
1066            || array.is::<VarBinView>()
1067            || array.is::<Variant>()
1068            || array.is::<Extension>()
1069    }
1070
1071    fn try_match(array: &ArrayRef) -> Option<Self::Match<'_>> {
1072        if let Some(a) = array.as_opt::<Null>() {
1073            Some(CanonicalView::Null(a))
1074        } else if let Some(a) = array.as_opt::<Bool>() {
1075            Some(CanonicalView::Bool(a))
1076        } else if let Some(a) = array.as_opt::<Primitive>() {
1077            Some(CanonicalView::Primitive(a))
1078        } else if let Some(a) = array.as_opt::<Decimal>() {
1079            Some(CanonicalView::Decimal(a))
1080        } else if let Some(a) = array.as_opt::<Struct>() {
1081            Some(CanonicalView::Struct(a))
1082        } else if let Some(a) = array.as_opt::<ListView>() {
1083            Some(CanonicalView::List(a))
1084        } else if let Some(a) = array.as_opt::<FixedSizeList>() {
1085            Some(CanonicalView::FixedSizeList(a))
1086        } else if let Some(a) = array.as_opt::<VarBinView>() {
1087            Some(CanonicalView::VarBinView(a))
1088        } else if let Some(a) = array.as_opt::<Variant>() {
1089            Some(CanonicalView::Variant(a))
1090        } else {
1091            array.as_opt::<Extension>().map(CanonicalView::Extension)
1092        }
1093    }
1094}
1095
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use arrow_array::Array as ArrowArray;
    use arrow_array::ArrayRef as ArrowArrayRef;
    use arrow_array::ListArray as ArrowListArray;
    use arrow_array::PrimitiveArray as ArrowPrimitiveArray;
    use arrow_array::StringArray;
    use arrow_array::StringViewArray;
    use arrow_array::StructArray as ArrowStructArray;
    use arrow_array::cast::AsArray;
    use arrow_array::types::Int32Type;
    use arrow_array::types::Int64Type;
    use arrow_array::types::UInt64Type;
    use arrow_buffer::NullBufferBuilder;
    use arrow_buffer::OffsetBuffer;
    use arrow_schema::DataType;
    use arrow_schema::Field;
    use vortex_buffer::buffer;

    use crate::ArrayRef;
    use crate::IntoArray;
    use crate::LEGACY_SESSION;
    use crate::VortexSessionExecute;
    use crate::arrays::ConstantArray;
    use crate::arrow::ArrowSessionExt;
    use crate::arrow::FromArrowArray;
    use crate::canonical::StructArray;

    /// A non-canonical child nested inside a struct must come out of the Arrow export as the
    /// corresponding Arrow primitive array.
    #[test]
    fn test_canonicalize_nested_struct() {
        let mut ctx = LEGACY_SESSION.create_execution_ctx();

        // The nested struct contains a ConstantArray representing the primitive array
        //   [100i64]
        // ConstantArray is not a canonical type, so converting `into_arrow()` should
        // map this to the nearest canonical type (PrimitiveArray).
        let inner = StructArray::from_fields(&[(
            "inner_a",
            ConstantArray::new(100i64, 1).into_array(),
        )])
        .unwrap()
        .into_array();

        // Create a struct array with multiple internal components.
        let nested_struct_array =
            StructArray::from_fields(&[("a", buffer![1u64].into_array()), ("b", inner)]).unwrap();

        let exported = LEGACY_SESSION
            .arrow()
            .execute_arrow(nested_struct_array.into_array(), None, &mut ctx)
            .unwrap();
        let arrow_struct = exported
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .cloned()
            .unwrap();

        // Field "a" is exported as an Arrow u64 primitive array.
        let column_a = arrow_struct.column(0);
        assert!(
            column_a
                .as_any()
                .is::<ArrowPrimitiveArray<UInt64Type>>()
        );

        // Field "b" is exported as a nested Arrow struct.
        let inner_struct = arrow_struct
            .column(1)
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .cloned()
            .unwrap();

        let inner_a = inner_struct
            .column(0)
            .as_any()
            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
        assert!(inner_a.is_some());

        let expected = ArrowPrimitiveArray::<Int64Type>::from_iter([100i64]);
        assert_eq!(inner_a.unwrap(), &expected);
    }

    /// A nullable Arrow struct with nullable children survives Arrow -> Vortex -> Arrow.
    #[test]
    fn roundtrip_struct() {
        let mut ctx = LEGACY_SESSION.create_execution_ctx();

        // Struct-level validity: four valid rows, one null row, one valid row.
        let mut struct_nulls = NullBufferBuilder::new(6);
        struct_nulls.append_n_non_nulls(4);
        struct_nulls.append_null();
        struct_nulls.append_non_null();

        let names: ArrowArrayRef = Arc::new(StringViewArray::from_iter([
            Some("Joseph"),
            None,
            Some("Angela"),
            Some("Mikhail"),
            None,
            None,
        ]));
        let ages: ArrowArrayRef = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
            Some(25),
            Some(31),
            None,
            Some(57),
            None,
            None,
        ]));

        let fields = vec![
            Arc::new(Field::new("name", DataType::Utf8View, true)),
            Arc::new(Field::new("age", DataType::Int32, true)),
        ];
        let arrow_struct =
            ArrowStructArray::new(fields.into(), vec![names, ages], struct_nulls.finish());

        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true).unwrap();
        let roundtripped = LEGACY_SESSION
            .arrow()
            .execute_arrow(vortex_struct, None, &mut ctx)
            .unwrap();

        assert_eq!(&arrow_struct, roundtripped.as_struct());
    }

    /// An Arrow list of strings survives Arrow -> Vortex -> Arrow when the target field is given.
    #[test]
    fn roundtrip_list() {
        let mut ctx = LEGACY_SESSION.create_execution_ctx();

        let names = Arc::new(StringArray::from_iter([
            Some("Joseph"),
            Some("Angela"),
            Some("Mikhail"),
        ]));
        let arrow_list = ArrowListArray::new(
            Arc::new(Field::new_list_field(DataType::Utf8, true)),
            OffsetBuffer::from_lengths([0, 2, 1]),
            names,
            None,
        );

        // Ask for exactly the original list type back on export.
        let list_field = Field::new(String::new(), arrow_list.data_type().clone(), true);

        let vortex_list = ArrayRef::from_arrow(&arrow_list, true).unwrap();
        let rt_arrow_list = LEGACY_SESSION
            .arrow()
            .execute_arrow(vortex_list, Some(&list_field), &mut ctx)
            .unwrap();

        let expected: ArrowArrayRef = Arc::new(arrow_list);
        assert_eq!(expected.as_ref(), rt_arrow_list.as_ref());
    }
}