Skip to main content

vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use std::sync::Arc;
7
8use vortex_buffer::BitBuffer;
9use vortex_buffer::Buffer;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_ensure;
13use vortex_error::vortex_panic;
14
15use crate::ArrayRef;
16use crate::DynArray;
17use crate::Executable;
18use crate::ExecutionCtx;
19use crate::IntoArray;
20use crate::arrays::BoolArray;
21use crate::arrays::BoolVTable;
22use crate::arrays::DecimalArray;
23use crate::arrays::DecimalVTable;
24use crate::arrays::ExtensionArray;
25use crate::arrays::ExtensionVTable;
26use crate::arrays::FixedSizeListArray;
27use crate::arrays::FixedSizeListVTable;
28use crate::arrays::ListViewArray;
29use crate::arrays::ListViewVTable;
30use crate::arrays::NullArray;
31use crate::arrays::NullVTable;
32use crate::arrays::PrimitiveArray;
33use crate::arrays::PrimitiveVTable;
34use crate::arrays::StructArray;
35use crate::arrays::StructVTable;
36use crate::arrays::VarBinViewArray;
37use crate::arrays::VarBinViewVTable;
38use crate::arrays::bool::BoolArrayParts;
39use crate::arrays::decimal::DecimalArrayParts;
40use crate::arrays::listview::ListViewArrayParts;
41use crate::arrays::listview::ListViewRebuildMode;
42use crate::arrays::primitive::PrimitiveArrayParts;
43use crate::arrays::struct_::StructArrayParts;
44use crate::arrays::varbinview::VarBinViewArrayParts;
45use crate::dtype::DType;
46use crate::dtype::NativePType;
47use crate::dtype::Nullability;
48use crate::dtype::PType;
49use crate::match_each_decimal_value_type;
50use crate::match_each_native_ptype;
51use crate::matcher::Matcher;
52use crate::validity::Validity;
53
/// An enum capturing the default uncompressed encodings for each [Vortex type](DType).
///
/// Any array can be decoded into canonical form via the [`to_canonical`](DynArray::to_canonical)
/// trait method. This is the simplest encoding for a type, and will not be compressed but may
/// contain compressed child arrays.
///
/// Canonical form is useful for doing type-specific compute where you need to know that all
/// elements are laid out decompressed and contiguous in memory.
///
/// Each `Canonical` variant has a corresponding [`DType`] variant, with the notable exception of
/// [`Canonical::VarBinView`], which is the canonical encoding for both [`DType::Utf8`] and
/// [`DType::Binary`].
///
/// # Laziness
///
/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
/// `Struct` type, individual column child arrays may still be compressed. This allows
/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
/// never need to be decoded into canonical form at all depending on the compute.
///
/// # Arrow interoperability
///
/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
/// zero-copy, and the corresponding Arrow array types can also be built directly.
///
/// The full list of canonical types and their equivalent Arrow array types is:
///
/// * `NullArray`: [`arrow_array::NullArray`]
/// * `BoolArray`: [`arrow_array::BooleanArray`]
/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
/// * `ListViewArray`: [`arrow_array::ListViewArray`]
/// * `FixedSizeListArray`: [`arrow_array::FixedSizeListArray`]
/// * `StructArray`: [`arrow_array::StructArray`]
///
/// Vortex uses a logical type system, unlike Arrow, which uses physical encodings for its types.
/// As an example, there are at least six valid physical encodings for a `Utf8` array, which
/// creates ambiguity: if you receive an Arrow array, compress it using Vortex, and then
/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
/// variants to hold the data.
///
/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
/// will correspond to an arrow-rs [`arrow_schema::DataType`].
///
/// # Views support
///
/// Binary and String views, also known as "German strings", are a better encoding format for
/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
/// fully supported by the DataFusion query engine. We use them as our canonical string encoding
/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
/// from memory.
///
/// # For Developers
///
/// If you add another variant to this enum, make sure to update `dyn Array::is_canonical`,
/// and the fuzzer in `fuzz/fuzz_targets/array_ops.rs`.
#[derive(Debug, Clone)]
pub enum Canonical {
    /// Canonical encoding for [`DType::Null`].
    Null(NullArray),
    /// Canonical encoding for [`DType::Bool`].
    Bool(BoolArray),
    /// Canonical encoding for [`DType::Primitive`].
    Primitive(PrimitiveArray),
    /// Canonical encoding for [`DType::Decimal`].
    Decimal(DecimalArray),
    /// Canonical encoding shared by [`DType::Utf8`] and [`DType::Binary`].
    VarBinView(VarBinViewArray),
    /// Canonical encoding for [`DType::List`]; note that the payload is a [`ListViewArray`].
    List(ListViewArray),
    /// Canonical encoding for [`DType::FixedSizeList`].
    FixedSizeList(FixedSizeListArray),
    /// Canonical encoding for [`DType::Struct`].
    Struct(StructArray),
    /// Canonical encoding for [`DType::Extension`], wrapping an array of the storage dtype.
    Extension(ExtensionArray),
}
126
/// Dispatches over every [`Canonical`] variant, binding the wrapped array to the given
/// identifier and evaluating the same expression for whichever variant is present.
///
/// The expression is expanded once per variant, so it must type-check for every canonical
/// array type.
macro_rules! match_each_canonical {
    ($canonical:expr, | $inner:ident | $body:expr) => {{
        match $canonical {
            Canonical::Null($inner) => $body,
            Canonical::Bool($inner) => $body,
            Canonical::Primitive($inner) => $body,
            Canonical::Decimal($inner) => $body,
            Canonical::VarBinView($inner) => $body,
            Canonical::List($inner) => $body,
            Canonical::FixedSizeList($inner) => $body,
            Canonical::Struct($inner) => $body,
            Canonical::Extension($inner) => $body,
        }
    }};
}
143
144impl Canonical {
145    /// Create an empty canonical array of the given dtype.
146    pub fn empty(dtype: &DType) -> Canonical {
147        match dtype {
148            DType::Null => Canonical::Null(NullArray::new(0)),
149            DType::Bool(n) => Canonical::Bool(unsafe {
150                BoolArray::new_unchecked(BitBuffer::empty(), Validity::from(n))
151            }),
152            DType::Primitive(ptype, n) => {
153                match_each_native_ptype!(ptype, |P| {
154                    Canonical::Primitive(unsafe {
155                        PrimitiveArray::new_unchecked(Buffer::<P>::empty(), Validity::from(n))
156                    })
157                })
158            }
159            DType::Decimal(decimal_type, n) => {
160                match_each_decimal_value_type!(
161                    DecimalType::smallest_decimal_value_type(decimal_type),
162                    |D| {
163                        Canonical::Decimal(unsafe {
164                            DecimalArray::new_unchecked::<D>(
165                                Buffer::empty(),
166                                *decimal_type,
167                                Validity::from(n),
168                            )
169                        })
170                    }
171                )
172            }
173            DType::Utf8(n) => Canonical::VarBinView(unsafe {
174                VarBinViewArray::new_unchecked(
175                    Buffer::empty(),
176                    Arc::new([]),
177                    dtype.clone(),
178                    Validity::from(n),
179                )
180            }),
181            DType::Binary(n) => Canonical::VarBinView(unsafe {
182                VarBinViewArray::new_unchecked(
183                    Buffer::empty(),
184                    Arc::new([]),
185                    dtype.clone(),
186                    Validity::from(n),
187                )
188            }),
189            DType::Struct(struct_dtype, n) => Canonical::Struct(unsafe {
190                StructArray::new_unchecked(
191                    struct_dtype
192                        .fields()
193                        .map(|f| Canonical::empty(&f).into_array())
194                        .collect::<Arc<[_]>>(),
195                    struct_dtype.clone(),
196                    0,
197                    Validity::from(n),
198                )
199            }),
200            DType::List(dtype, n) => Canonical::List(unsafe {
201                ListViewArray::new_unchecked(
202                    Canonical::empty(dtype).into_array(),
203                    Canonical::empty(&DType::Primitive(PType::U8, Nullability::NonNullable))
204                        .into_array(),
205                    Canonical::empty(&DType::Primitive(PType::U8, Nullability::NonNullable))
206                        .into_array(),
207                    Validity::from(n),
208                )
209                .with_zero_copy_to_list(true)
210            }),
211            DType::FixedSizeList(elem_dtype, list_size, null) => Canonical::FixedSizeList(unsafe {
212                FixedSizeListArray::new_unchecked(
213                    Canonical::empty(elem_dtype).into_array(),
214                    *list_size,
215                    Validity::from(null),
216                    0,
217                )
218            }),
219            DType::Extension(ext_dtype) => Canonical::Extension(ExtensionArray::new(
220                ext_dtype.clone(),
221                Canonical::empty(ext_dtype.storage_dtype()).into_array(),
222            )),
223        }
224    }
225
226    pub fn len(&self) -> usize {
227        match_each_canonical!(self, |arr| arr.len())
228    }
229
230    pub fn dtype(&self) -> &DType {
231        match_each_canonical!(self, |arr| arr.dtype())
232    }
233
234    pub fn is_empty(&self) -> bool {
235        match_each_canonical!(self, |arr| arr.is_empty())
236    }
237}
238
239impl Canonical {
240    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
241    ///
242    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
243    /// they can accumulate wasted space after slicing and taking operations.
244    ///
245    /// This operation is very expensive and can result in things like allocations, full-scans
246    /// and copy operations.
247    pub fn compact(&self) -> VortexResult<Canonical> {
248        match self {
249            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
250            Canonical::List(array) => Ok(Canonical::List(
251                array.rebuild(ListViewRebuildMode::TrimElements)?,
252            )),
253            _ => Ok(self.clone()),
254        }
255    }
256}
257
258// Unwrap canonical type back down to specialized type.
259impl Canonical {
260    pub fn as_null(&self) -> &NullArray {
261        if let Canonical::Null(a) = self {
262            a
263        } else {
264            vortex_panic!("Cannot get NullArray from {:?}", &self)
265        }
266    }
267
268    pub fn into_null(self) -> NullArray {
269        if let Canonical::Null(a) = self {
270            a
271        } else {
272            vortex_panic!("Cannot unwrap NullArray from {:?}", &self)
273        }
274    }
275
276    pub fn as_bool(&self) -> &BoolArray {
277        if let Canonical::Bool(a) = self {
278            a
279        } else {
280            vortex_panic!("Cannot get BoolArray from {:?}", &self)
281        }
282    }
283
284    pub fn into_bool(self) -> BoolArray {
285        if let Canonical::Bool(a) = self {
286            a
287        } else {
288            vortex_panic!("Cannot unwrap BoolArray from {:?}", &self)
289        }
290    }
291
292    pub fn as_primitive(&self) -> &PrimitiveArray {
293        if let Canonical::Primitive(a) = self {
294            a
295        } else {
296            vortex_panic!("Cannot get PrimitiveArray from {:?}", &self)
297        }
298    }
299
300    pub fn into_primitive(self) -> PrimitiveArray {
301        if let Canonical::Primitive(a) = self {
302            a
303        } else {
304            vortex_panic!("Cannot unwrap PrimitiveArray from {:?}", &self)
305        }
306    }
307
308    pub fn as_decimal(&self) -> &DecimalArray {
309        if let Canonical::Decimal(a) = self {
310            a
311        } else {
312            vortex_panic!("Cannot get DecimalArray from {:?}", &self)
313        }
314    }
315
316    pub fn into_decimal(self) -> DecimalArray {
317        if let Canonical::Decimal(a) = self {
318            a
319        } else {
320            vortex_panic!("Cannot unwrap DecimalArray from {:?}", &self)
321        }
322    }
323
324    pub fn as_varbinview(&self) -> &VarBinViewArray {
325        if let Canonical::VarBinView(a) = self {
326            a
327        } else {
328            vortex_panic!("Cannot get VarBinViewArray from {:?}", &self)
329        }
330    }
331
332    pub fn into_varbinview(self) -> VarBinViewArray {
333        if let Canonical::VarBinView(a) = self {
334            a
335        } else {
336            vortex_panic!("Cannot unwrap VarBinViewArray from {:?}", &self)
337        }
338    }
339
340    pub fn as_listview(&self) -> &ListViewArray {
341        if let Canonical::List(a) = self {
342            a
343        } else {
344            vortex_panic!("Cannot get ListArray from {:?}", &self)
345        }
346    }
347
348    pub fn into_listview(self) -> ListViewArray {
349        if let Canonical::List(a) = self {
350            a
351        } else {
352            vortex_panic!("Cannot unwrap ListArray from {:?}", &self)
353        }
354    }
355
356    pub fn as_fixed_size_list(&self) -> &FixedSizeListArray {
357        if let Canonical::FixedSizeList(a) = self {
358            a
359        } else {
360            vortex_panic!("Cannot get FixedSizeListArray from {:?}", &self)
361        }
362    }
363
364    pub fn into_fixed_size_list(self) -> FixedSizeListArray {
365        if let Canonical::FixedSizeList(a) = self {
366            a
367        } else {
368            vortex_panic!("Cannot unwrap FixedSizeListArray from {:?}", &self)
369        }
370    }
371
372    pub fn as_struct(&self) -> &StructArray {
373        if let Canonical::Struct(a) = self {
374            a
375        } else {
376            vortex_panic!("Cannot get StructArray from {:?}", &self)
377        }
378    }
379
380    pub fn into_struct(self) -> StructArray {
381        if let Canonical::Struct(a) = self {
382            a
383        } else {
384            vortex_panic!("Cannot unwrap StructArray from {:?}", &self)
385        }
386    }
387
388    pub fn as_extension(&self) -> &ExtensionArray {
389        if let Canonical::Extension(a) = self {
390            a
391        } else {
392            vortex_panic!("Cannot get ExtensionArray from {:?}", &self)
393        }
394    }
395
396    pub fn into_extension(self) -> ExtensionArray {
397        if let Canonical::Extension(a) = self {
398            a
399        } else {
400            vortex_panic!("Cannot unwrap ExtensionArray from {:?}", &self)
401        }
402    }
403}
404
405impl AsRef<dyn DynArray> for Canonical {
406    fn as_ref(&self) -> &(dyn DynArray + 'static) {
407        match_each_canonical!(self, |arr| arr.as_ref())
408    }
409}
410
411impl IntoArray for Canonical {
412    fn into_array(self) -> ArrayRef {
413        match_each_canonical!(self, |arr| arr.into_array())
414    }
415}
416
/// Convenience methods that canonicalize a borrowed array and unwrap the result as one specific
/// canonical array type.
///
/// # Canonicalization
///
/// This trait has a blanket implementation for all types implementing [`DynArray`], which
/// calls [`to_canonical`](DynArray::to_canonical) and then unwraps the expected [`Canonical`]
/// variant.
///
/// # Panics
///
/// In the blanket implementation, every method panics if canonicalization fails, or if the
/// canonical form of the array is a different variant from the one requested.
pub trait ToCanonical {
    /// Canonicalize into a [`NullArray`] if the target is [`Null`](DType::Null) typed.
    fn to_null(&self) -> NullArray;

    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`](DType::Bool) typed.
    fn to_bool(&self) -> BoolArray;

    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`](DType::Primitive)
    /// typed.
    fn to_primitive(&self) -> PrimitiveArray;

    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`](DType::Decimal)
    /// typed.
    fn to_decimal(&self) -> DecimalArray;

    /// Canonicalize into a [`StructArray`] if the target is [`Struct`](DType::Struct) typed.
    fn to_struct(&self) -> StructArray;

    /// Canonicalize into a [`ListViewArray`] if the target is [`List`](DType::List) typed.
    fn to_listview(&self) -> ListViewArray;

    /// Canonicalize into a [`FixedSizeListArray`] if the target is
    /// [`FixedSizeList`](DType::FixedSizeList) typed.
    fn to_fixed_size_list(&self) -> FixedSizeListArray;

    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`](DType::Utf8)
    /// or [`Binary`](DType::Binary) typed.
    fn to_varbinview(&self) -> VarBinViewArray;

    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`](DType::Extension)
    /// typed.
    fn to_extension(&self) -> ExtensionArray;
}
455
456// Blanket impl for all Array encodings.
457impl<A: DynArray + ?Sized> ToCanonical for A {
458    fn to_null(&self) -> NullArray {
459        self.to_canonical()
460            .vortex_expect("to_canonical failed")
461            .into_null()
462    }
463
464    fn to_bool(&self) -> BoolArray {
465        self.to_canonical()
466            .vortex_expect("to_canonical failed")
467            .into_bool()
468    }
469
470    fn to_primitive(&self) -> PrimitiveArray {
471        self.to_canonical()
472            .vortex_expect("to_canonical failed")
473            .into_primitive()
474    }
475
476    fn to_decimal(&self) -> DecimalArray {
477        self.to_canonical()
478            .vortex_expect("to_canonical failed")
479            .into_decimal()
480    }
481
482    fn to_struct(&self) -> StructArray {
483        self.to_canonical()
484            .vortex_expect("to_canonical failed")
485            .into_struct()
486    }
487
488    fn to_listview(&self) -> ListViewArray {
489        self.to_canonical()
490            .vortex_expect("to_canonical failed")
491            .into_listview()
492    }
493
494    fn to_fixed_size_list(&self) -> FixedSizeListArray {
495        self.to_canonical()
496            .vortex_expect("to_canonical failed")
497            .into_fixed_size_list()
498    }
499
500    fn to_varbinview(&self) -> VarBinViewArray {
501        self.to_canonical()
502            .vortex_expect("to_canonical failed")
503            .into_varbinview()
504    }
505
506    fn to_extension(&self) -> ExtensionArray {
507        self.to_canonical()
508            .vortex_expect("to_canonical failed")
509            .into_extension()
510    }
511}
512
513impl From<Canonical> for ArrayRef {
514    fn from(value: Canonical) -> Self {
515        match_each_canonical!(value, |arr| arr.into_array())
516    }
517}
518
519/// Execute into [`Canonical`] by running `execute_until` with the [`AnyCanonical`] matcher.
520///
521/// Unlike executing into [`crate::Columnar`], this will fully expand constant arrays into their
522/// canonical form. Callers should prefer to execute into `Columnar` if they are able to optimize
523/// their use for constant arrays.
524impl Executable for Canonical {
525    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
526        let result = array.execute_until::<AnyCanonical>(ctx)?;
527        Ok(result
528            .as_opt::<AnyCanonical>()
529            .map(Canonical::from)
530            .vortex_expect("execute_until::<AnyCanonical> must return a canonical array"))
531    }
532}
533
534/// Recursively execute the array until it reaches canonical form along with its validity.
535///
536/// Callers should prefer to execute into `Columnar` instead of this specific target.
537/// This target is useful when preparing arrays for writing.
538pub struct CanonicalValidity(pub Canonical);
539
540impl Executable for CanonicalValidity {
541    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
542        match array.execute::<Canonical>(ctx)? {
543            n @ Canonical::Null(_) => Ok(CanonicalValidity(n)),
544            Canonical::Bool(b) => {
545                let BoolArrayParts {
546                    bits,
547                    offset,
548                    len,
549                    validity,
550                } = b.into_parts();
551                Ok(CanonicalValidity(Canonical::Bool(
552                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
553                )))
554            }
555            Canonical::Primitive(p) => {
556                let PrimitiveArrayParts {
557                    ptype,
558                    buffer,
559                    validity,
560                } = p.into_parts();
561                Ok(CanonicalValidity(Canonical::Primitive(unsafe {
562                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
563                })))
564            }
565            Canonical::Decimal(d) => {
566                let DecimalArrayParts {
567                    decimal_dtype,
568                    values,
569                    values_type,
570                    validity,
571                } = d.into_parts();
572                Ok(CanonicalValidity(Canonical::Decimal(unsafe {
573                    DecimalArray::new_unchecked_handle(
574                        values,
575                        values_type,
576                        decimal_dtype,
577                        validity.execute(ctx)?,
578                    )
579                })))
580            }
581            Canonical::VarBinView(vbv) => {
582                let VarBinViewArrayParts {
583                    dtype,
584                    buffers,
585                    views,
586                    validity,
587                } = vbv.into_parts();
588                Ok(CanonicalValidity(Canonical::VarBinView(unsafe {
589                    VarBinViewArray::new_handle_unchecked(
590                        views,
591                        buffers,
592                        dtype,
593                        validity.execute(ctx)?,
594                    )
595                })))
596            }
597            Canonical::List(l) => {
598                let zctl = l.is_zero_copy_to_list();
599                let ListViewArrayParts {
600                    elements,
601                    offsets,
602                    sizes,
603                    validity,
604                    ..
605                } = l.into_parts();
606                Ok(CanonicalValidity(Canonical::List(unsafe {
607                    ListViewArray::new_unchecked(elements, offsets, sizes, validity.execute(ctx)?)
608                        .with_zero_copy_to_list(zctl)
609                })))
610            }
611            Canonical::FixedSizeList(fsl) => {
612                let list_size = fsl.list_size();
613                let len = fsl.len();
614                let (elements, validity, _) = fsl.into_parts();
615                Ok(CanonicalValidity(Canonical::FixedSizeList(
616                    FixedSizeListArray::new(elements, list_size, validity.execute(ctx)?, len),
617                )))
618            }
619            Canonical::Struct(st) => {
620                let len = st.len();
621                let StructArrayParts {
622                    struct_fields,
623                    fields,
624                    validity,
625                } = st.into_parts();
626                Ok(CanonicalValidity(Canonical::Struct(unsafe {
627                    StructArray::new_unchecked(fields, struct_fields, len, validity.execute(ctx)?)
628                })))
629            }
630            Canonical::Extension(ext) => Ok(CanonicalValidity(Canonical::Extension(
631                ExtensionArray::new(
632                    ext.ext_dtype().clone(),
633                    ext.storage()
634                        .clone()
635                        .execute::<CanonicalValidity>(ctx)?
636                        .0
637                        .into_array(),
638                ),
639            ))),
640        }
641    }
642}
643
644/// Recursively execute the array until all of its children are canonical.
645///
646/// This method is useful to guarantee that all operators are fully executed,
647/// callers should prefer an execution target that's suitable for their use case instead of this one.
648pub struct RecursiveCanonical(pub Canonical);
649
650impl Executable for RecursiveCanonical {
651    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
652        match array.execute::<Canonical>(ctx)? {
653            n @ Canonical::Null(_) => Ok(RecursiveCanonical(n)),
654            Canonical::Bool(b) => {
655                let BoolArrayParts {
656                    bits,
657                    offset,
658                    len,
659                    validity,
660                } = b.into_parts();
661                Ok(RecursiveCanonical(Canonical::Bool(
662                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
663                )))
664            }
665            Canonical::Primitive(p) => {
666                let PrimitiveArrayParts {
667                    ptype,
668                    buffer,
669                    validity,
670                } = p.into_parts();
671                Ok(RecursiveCanonical(Canonical::Primitive(unsafe {
672                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
673                })))
674            }
675            Canonical::Decimal(d) => {
676                let DecimalArrayParts {
677                    decimal_dtype,
678                    values,
679                    values_type,
680                    validity,
681                } = d.into_parts();
682                Ok(RecursiveCanonical(Canonical::Decimal(unsafe {
683                    DecimalArray::new_unchecked_handle(
684                        values,
685                        values_type,
686                        decimal_dtype,
687                        validity.execute(ctx)?,
688                    )
689                })))
690            }
691            Canonical::VarBinView(vbv) => {
692                let VarBinViewArrayParts {
693                    dtype,
694                    buffers,
695                    views,
696                    validity,
697                } = vbv.into_parts();
698                Ok(RecursiveCanonical(Canonical::VarBinView(unsafe {
699                    VarBinViewArray::new_handle_unchecked(
700                        views,
701                        buffers,
702                        dtype,
703                        validity.execute(ctx)?,
704                    )
705                })))
706            }
707            Canonical::List(l) => {
708                let zctl = l.is_zero_copy_to_list();
709                let ListViewArrayParts {
710                    elements,
711                    offsets,
712                    sizes,
713                    validity,
714                    ..
715                } = l.into_parts();
716                Ok(RecursiveCanonical(Canonical::List(unsafe {
717                    ListViewArray::new_unchecked(
718                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
719                        offsets.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
720                        sizes.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
721                        validity.execute(ctx)?,
722                    )
723                    .with_zero_copy_to_list(zctl)
724                })))
725            }
726            Canonical::FixedSizeList(fsl) => {
727                let list_size = fsl.list_size();
728                let len = fsl.len();
729                let (elements, validity, _) = fsl.into_parts();
730                Ok(RecursiveCanonical(Canonical::FixedSizeList(
731                    FixedSizeListArray::new(
732                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
733                        list_size,
734                        validity.execute(ctx)?,
735                        len,
736                    ),
737                )))
738            }
739            Canonical::Struct(st) => {
740                let len = st.len();
741                let StructArrayParts {
742                    struct_fields,
743                    fields,
744                    validity,
745                } = st.into_parts();
746                let executed_fields = fields
747                    .iter()
748                    .map(|f| Ok(f.clone().execute::<RecursiveCanonical>(ctx)?.0.into_array()))
749                    .collect::<VortexResult<Arc<[_]>>>()?;
750
751                Ok(RecursiveCanonical(Canonical::Struct(unsafe {
752                    StructArray::new_unchecked(
753                        executed_fields,
754                        struct_fields,
755                        len,
756                        validity.execute(ctx)?,
757                    )
758                })))
759            }
760            Canonical::Extension(ext) => Ok(RecursiveCanonical(Canonical::Extension(
761                ExtensionArray::new(
762                    ext.ext_dtype().clone(),
763                    ext.storage()
764                        .clone()
765                        .execute::<RecursiveCanonical>(ctx)?
766                        .0
767                        .into_array(),
768                ),
769            ))),
770        }
771    }
772}
773
774/// Execute a primitive typed array into a buffer of native values, assuming all values are valid.
775///
776/// # Errors
777///
778/// Returns a `VortexError` if the array is not all-valid (has any nulls).
779impl<T: NativePType> Executable for Buffer<T> {
780    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
781        let array = PrimitiveArray::execute(array, ctx)?;
782        vortex_ensure!(
783            array.all_valid()?,
784            "Cannot execute to native buffer: array is not all-valid."
785        );
786        Ok(array.into_buffer())
787    }
788}
789
790/// Execute the array to canonical form and unwrap as a [`PrimitiveArray`].
791///
792/// This will panic if the array's dtype is not primitive.
793impl Executable for PrimitiveArray {
794    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
795        match array.try_into::<PrimitiveVTable>() {
796            Ok(primitive) => Ok(primitive),
797            Err(array) => Ok(Canonical::execute(array, ctx)?.into_primitive()),
798        }
799    }
800}
801
802/// Execute the array to canonical form and unwrap as a [`BoolArray`].
803///
804/// This will panic if the array's dtype is not bool.
805impl Executable for BoolArray {
806    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
807        match array.try_into::<BoolVTable>() {
808            Ok(bool_array) => Ok(bool_array),
809            Err(array) => Ok(Canonical::execute(array, ctx)?.into_bool()),
810        }
811    }
812}
813
814/// Execute the array to a [`BitBuffer`], aka a non-nullable  [`BoolArray`].
815///
816/// This will panic if the array's dtype is not non-nullable bool.
817impl Executable for BitBuffer {
818    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
819        let bool = BoolArray::execute(array, ctx)?;
820        assert!(
821            !bool.dtype().is_nullable(),
822            "bit buffer execute only works with non-nullable bool arrays"
823        );
824        Ok(bool.into_bit_buffer())
825    }
826}
827
828/// Execute the array to canonical form and unwrap as a [`NullArray`].
829///
830/// This will panic if the array's dtype is not null.
831impl Executable for NullArray {
832    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
833        match array.try_into::<NullVTable>() {
834            Ok(null_array) => Ok(null_array),
835            Err(array) => Ok(Canonical::execute(array, ctx)?.into_null()),
836        }
837    }
838}
839
840/// Execute the array to canonical form and unwrap as a [`VarBinViewArray`].
841///
842/// This will panic if the array's dtype is not utf8 or binary.
843impl Executable for VarBinViewArray {
844    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
845        match array.try_into::<VarBinViewVTable>() {
846            Ok(varbinview) => Ok(varbinview),
847            Err(array) => Ok(Canonical::execute(array, ctx)?.into_varbinview()),
848        }
849    }
850}
851
852/// Execute the array to canonical form and unwrap as an [`ExtensionArray`].
853///
854/// This will panic if the array's dtype is not an extension type.
855impl Executable for ExtensionArray {
856    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
857        match array.try_into::<ExtensionVTable>() {
858            Ok(ext_array) => Ok(ext_array),
859            Err(array) => Ok(Canonical::execute(array, ctx)?.into_extension()),
860        }
861    }
862}
863
864/// Execute the array to canonical form and unwrap as a [`DecimalArray`].
865///
866/// This will panic if the array's dtype is not decimal.
867impl Executable for DecimalArray {
868    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
869        match array.try_into::<DecimalVTable>() {
870            Ok(decimal) => Ok(decimal),
871            Err(array) => Ok(Canonical::execute(array, ctx)?.into_decimal()),
872        }
873    }
874}
875
876/// Execute the array to canonical form and unwrap as a [`ListViewArray`].
877///
878/// This will panic if the array's dtype is not list.
879impl Executable for ListViewArray {
880    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
881        match array.try_into::<ListViewVTable>() {
882            Ok(list) => Ok(list),
883            Err(array) => Ok(Canonical::execute(array, ctx)?.into_listview()),
884        }
885    }
886}
887
888/// Execute the array to canonical form and unwrap as a [`FixedSizeListArray`].
889///
890/// This will panic if the array's dtype is not fixed size list.
891impl Executable for FixedSizeListArray {
892    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
893        match array.try_into::<FixedSizeListVTable>() {
894            Ok(fsl) => Ok(fsl),
895            Err(array) => Ok(Canonical::execute(array, ctx)?.into_fixed_size_list()),
896        }
897    }
898}
899
900/// Execute the array to canonical form and unwrap as a [`StructArray`].
901///
902/// This will panic if the array's dtype is not struct.
903impl Executable for StructArray {
904    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
905        match array.try_into::<StructVTable>() {
906            Ok(struct_array) => Ok(struct_array),
907            Err(array) => Ok(Canonical::execute(array, ctx)?.into_struct()),
908        }
909    }
910}
911
912/// A view into a canonical array type.
913#[derive(Debug, Clone)]
914pub enum CanonicalView<'a> {
915    Null(&'a NullArray),
916    Bool(&'a BoolArray),
917    Primitive(&'a PrimitiveArray),
918    Decimal(&'a DecimalArray),
919    VarBinView(&'a VarBinViewArray),
920    List(&'a ListViewArray),
921    FixedSizeList(&'a FixedSizeListArray),
922    Struct(&'a StructArray),
923    Extension(&'a ExtensionArray),
924}
925
926impl From<CanonicalView<'_>> for Canonical {
927    fn from(value: CanonicalView<'_>) -> Self {
928        match value {
929            CanonicalView::Null(a) => Canonical::Null(a.clone()),
930            CanonicalView::Bool(a) => Canonical::Bool(a.clone()),
931            CanonicalView::Primitive(a) => Canonical::Primitive(a.clone()),
932            CanonicalView::Decimal(a) => Canonical::Decimal(a.clone()),
933            CanonicalView::VarBinView(a) => Canonical::VarBinView(a.clone()),
934            CanonicalView::List(a) => Canonical::List(a.clone()),
935            CanonicalView::FixedSizeList(a) => Canonical::FixedSizeList(a.clone()),
936            CanonicalView::Struct(a) => Canonical::Struct(a.clone()),
937            CanonicalView::Extension(a) => Canonical::Extension(a.clone()),
938        }
939    }
940}
941
942impl AsRef<dyn DynArray> for CanonicalView<'_> {
943    fn as_ref(&self) -> &dyn DynArray {
944        match self {
945            CanonicalView::Null(a) => a.as_ref(),
946            CanonicalView::Bool(a) => a.as_ref(),
947            CanonicalView::Primitive(a) => a.as_ref(),
948            CanonicalView::Decimal(a) => a.as_ref(),
949            CanonicalView::VarBinView(a) => a.as_ref(),
950            CanonicalView::List(a) => a.as_ref(),
951            CanonicalView::FixedSizeList(a) => a.as_ref(),
952            CanonicalView::Struct(a) => a.as_ref(),
953            CanonicalView::Extension(a) => a.as_ref(),
954        }
955    }
956}
957
958/// A matcher for any canonical array type.
959pub struct AnyCanonical;
960impl Matcher for AnyCanonical {
961    type Match<'a> = CanonicalView<'a>;
962
963    fn matches(array: &dyn DynArray) -> bool {
964        array.is::<NullVTable>()
965            || array.is::<BoolVTable>()
966            || array.is::<PrimitiveVTable>()
967            || array.is::<DecimalVTable>()
968            || array.is::<StructVTable>()
969            || array.is::<ListViewVTable>()
970            || array.is::<FixedSizeListVTable>()
971            || array.is::<VarBinViewVTable>()
972            || array.is::<ExtensionVTable>()
973    }
974
975    fn try_match<'a>(array: &'a dyn DynArray) -> Option<Self::Match<'a>> {
976        if let Some(a) = array.as_opt::<NullVTable>() {
977            Some(CanonicalView::Null(a))
978        } else if let Some(a) = array.as_opt::<BoolVTable>() {
979            Some(CanonicalView::Bool(a))
980        } else if let Some(a) = array.as_opt::<PrimitiveVTable>() {
981            Some(CanonicalView::Primitive(a))
982        } else if let Some(a) = array.as_opt::<DecimalVTable>() {
983            Some(CanonicalView::Decimal(a))
984        } else if let Some(a) = array.as_opt::<StructVTable>() {
985            Some(CanonicalView::Struct(a))
986        } else if let Some(a) = array.as_opt::<ListViewVTable>() {
987            Some(CanonicalView::List(a))
988        } else if let Some(a) = array.as_opt::<FixedSizeListVTable>() {
989            Some(CanonicalView::FixedSizeList(a))
990        } else if let Some(a) = array.as_opt::<VarBinViewVTable>() {
991            Some(CanonicalView::VarBinView(a))
992        } else {
993            array
994                .as_opt::<ExtensionVTable>()
995                .map(CanonicalView::Extension)
996        }
997    }
998}
999
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use arrow_array::Array as ArrowArray;
    use arrow_array::ArrayRef as ArrowArrayRef;
    use arrow_array::ListArray as ArrowListArray;
    use arrow_array::PrimitiveArray as ArrowPrimitiveArray;
    use arrow_array::StringArray;
    use arrow_array::StringViewArray;
    use arrow_array::StructArray as ArrowStructArray;
    use arrow_array::cast::AsArray;
    use arrow_array::types::Int32Type;
    use arrow_array::types::Int64Type;
    use arrow_array::types::UInt64Type;
    use arrow_buffer::NullBufferBuilder;
    use arrow_buffer::OffsetBuffer;
    use arrow_schema::DataType;
    use arrow_schema::Field;
    use vortex_buffer::buffer;

    use crate::ArrayRef;
    use crate::IntoArray;
    use crate::arrays::ConstantArray;
    use crate::arrow::FromArrowArray;
    use crate::arrow::IntoArrowArray;
    use crate::canonical::StructArray;

    /// Exporting a struct to Arrow must canonicalize non-canonical children nested inside it.
    #[test]
    fn test_canonicalize_nested_struct() {
        // The inner struct holds a ConstantArray standing in for the primitive array [100i64].
        // ConstantArray is not a canonical type, so `into_arrow_preferred()` should map it to
        // the nearest canonical type (PrimitiveArray).
        let inner = StructArray::from_fields(&[(
            "inner_a",
            ConstantArray::new(100i64, 1).into_array(),
        )])
        .unwrap()
        .into_array();

        // Outer struct with multiple internal components.
        let outer =
            StructArray::from_fields(&[("a", buffer![1u64].into_array()), ("b", inner)]).unwrap();

        let exported = outer.into_array().into_arrow_preferred().unwrap();
        let arrow_struct = exported
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .unwrap();

        // Column "a" stays a plain u64 primitive array.
        assert!(
            arrow_struct
                .column(0)
                .as_any()
                .is::<ArrowPrimitiveArray<UInt64Type>>()
        );

        let inner_struct = arrow_struct
            .column(1)
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .unwrap();

        let inner_a = inner_struct
            .column(0)
            .as_any()
            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
        assert!(inner_a.is_some());

        let expected = ArrowPrimitiveArray::<Int64Type>::from_iter([100i64]);
        assert_eq!(inner_a.unwrap(), &expected);
    }

    /// An Arrow struct with struct-level and field-level nulls survives Arrow -> Vortex -> Arrow.
    #[test]
    fn roundtrip_struct() {
        // Struct-level validity for six rows: four valid, one null, one valid.
        let mut validity = NullBufferBuilder::new(6);
        validity.append_n_non_nulls(4);
        validity.append_null();
        validity.append_non_null();

        let names = Arc::new(StringViewArray::from_iter(vec![
            Some("Joseph"),
            None,
            Some("Angela"),
            Some("Mikhail"),
            None,
            None,
        ]));
        let ages = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
            Some(25),
            Some(31),
            None,
            Some(57),
            None,
            None,
        ]));

        let fields = vec![
            Arc::new(Field::new("name", DataType::Utf8View, true)),
            Arc::new(Field::new("age", DataType::Int32, true)),
        ];
        let arrow_struct =
            ArrowStructArray::new(fields.into(), vec![names, ages], validity.finish());

        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true).unwrap();
        let roundtripped = vortex_struct.into_arrow_preferred().unwrap();

        assert_eq!(&arrow_struct, roundtripped.as_struct());
    }

    /// An Arrow list of strings survives Arrow -> Vortex -> Arrow when exported with its
    /// original data type.
    #[test]
    fn roundtrip_list() {
        let names = Arc::new(StringArray::from_iter(vec![
            Some("Joseph"),
            Some("Angela"),
            Some("Mikhail"),
        ]));

        let item_field = Arc::new(Field::new_list_field(DataType::Utf8, true));
        let offsets = OffsetBuffer::from_lengths(vec![0, 2, 1]);
        let arrow_list = ArrowListArray::new(item_field, offsets, names, None);
        let list_data_type = arrow_list.data_type();

        let vortex_list = ArrayRef::from_arrow(&arrow_list, true).unwrap();
        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();

        let expected: ArrowArrayRef = Arc::new(arrow_list.clone());
        assert_eq!(expected.as_ref(), rt_arrow_list.as_ref());
    }
}