Skip to main content

vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use std::sync::Arc;
7
8use vortex_buffer::Buffer;
9use vortex_dtype::DType;
10use vortex_dtype::NativePType;
11use vortex_error::VortexExpect;
12use vortex_error::VortexResult;
13use vortex_error::vortex_ensure;
14use vortex_error::vortex_panic;
15
16use crate::Array;
17use crate::ArrayRef;
18use crate::Columnar;
19use crate::Executable;
20use crate::ExecutionCtx;
21use crate::IntoArray;
22use crate::arrays::BoolArray;
23use crate::arrays::BoolArrayParts;
24use crate::arrays::BoolVTable;
25use crate::arrays::DecimalArray;
26use crate::arrays::DecimalArrayParts;
27use crate::arrays::DecimalVTable;
28use crate::arrays::ExtensionArray;
29use crate::arrays::ExtensionVTable;
30use crate::arrays::FixedSizeListArray;
31use crate::arrays::FixedSizeListVTable;
32use crate::arrays::ListViewArray;
33use crate::arrays::ListViewArrayParts;
34use crate::arrays::ListViewRebuildMode;
35use crate::arrays::ListViewVTable;
36use crate::arrays::NullArray;
37use crate::arrays::NullVTable;
38use crate::arrays::PrimitiveArray;
39use crate::arrays::PrimitiveArrayParts;
40use crate::arrays::PrimitiveVTable;
41use crate::arrays::StructArray;
42use crate::arrays::StructArrayParts;
43use crate::arrays::StructVTable;
44use crate::arrays::VarBinViewArray;
45use crate::arrays::VarBinViewArrayParts;
46use crate::arrays::VarBinViewVTable;
47use crate::arrays::constant_canonicalize;
48use crate::builders::builder_with_capacity;
49use crate::matcher::Matcher;
50
51/// An enum capturing the default uncompressed encodings for each [Vortex type](DType).
52///
53/// Any array can be decoded into canonical form via the [`to_canonical`](Array::to_canonical)
54/// trait method. This is the simplest encoding for a type, and will not be compressed but may
55/// contain compressed child arrays.
56///
57/// Canonical form is useful for doing type-specific compute where you need to know that all
58/// elements are laid out decompressed and contiguous in memory.
59///
60/// Each `Canonical` variant has a corresponding [`DType`] variant, with the notable exception of
61/// [`Canonical::VarBinView`], which is the canonical encoding for both [`DType::Utf8`] and
62/// [`DType::Binary`].
63///
64/// # Laziness
65///
66/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
67/// `Struct` type, individual column child arrays may still be compressed. This allows
68/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
69/// never need to be decoded into canonical form at all depending on the compute.
70///
71/// # Arrow interoperability
72///
73/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
74/// zero-copy, and the corresponding Arrow array types can also be built directly.
75///
76/// The full list of canonical types and their equivalent Arrow array types are:
77///
78/// * `NullArray`: [`arrow_array::NullArray`]
79/// * `BoolArray`: [`arrow_array::BooleanArray`]
80/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
81/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
82/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
83/// * `ListViewArray`: [`arrow_array::ListViewArray`]
84/// * `FixedSizeListArray`: [`arrow_array::FixedSizeListArray`]
85/// * `StructArray`: [`arrow_array::StructArray`]
86///
87/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
88/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
89/// create ambiguity.
90/// Thus, if you receive an Arrow array, compress it using Vortex, and then
91/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
92/// variants to hold the data.
93///
94/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
95/// will correspond to an arrow-rs [`arrow_schema::DataType`].
96///
97/// # Views support
98///
99/// Binary and String views, also known as "German strings" are a better encoding format for
100/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
101/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
102/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
103/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
104/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
105/// from memory.
106///
107/// # For Developers
108///
109/// If you add another variant to this enum, make sure to update `dyn Array::is_canonical`,
110/// and the fuzzer in `fuzz/fuzz_targets/array_ops.rs`.
111#[derive(Debug, Clone)]
112pub enum Canonical {
113    Null(NullArray),
114    Bool(BoolArray),
115    Primitive(PrimitiveArray),
116    Decimal(DecimalArray),
117    VarBinView(VarBinViewArray),
118    List(ListViewArray),
119    FixedSizeList(FixedSizeListArray),
120    Struct(StructArray),
121    Extension(ExtensionArray),
122}
123
124impl Canonical {
125    // TODO(connor): This can probably be specialized for each of the canonical arrays.
126    /// Create an empty canonical array of the given dtype.
127    pub fn empty(dtype: &DType) -> Canonical {
128        builder_with_capacity(dtype, 0).finish_into_canonical()
129    }
130
131    pub fn len(&self) -> usize {
132        match self {
133            Canonical::Null(c) => c.len(),
134            Canonical::Bool(c) => c.len(),
135            Canonical::Primitive(c) => c.len(),
136            Canonical::Decimal(c) => c.len(),
137            Canonical::VarBinView(c) => c.len(),
138            Canonical::List(c) => c.len(),
139            Canonical::FixedSizeList(c) => c.len(),
140            Canonical::Struct(c) => c.len(),
141            Canonical::Extension(c) => c.len(),
142        }
143    }
144
145    pub fn dtype(&self) -> &DType {
146        match self {
147            Canonical::Null(c) => c.dtype(),
148            Canonical::Bool(c) => c.dtype(),
149            Canonical::Primitive(c) => c.dtype(),
150            Canonical::Decimal(c) => c.dtype(),
151            Canonical::VarBinView(c) => c.dtype(),
152            Canonical::List(c) => c.dtype(),
153            Canonical::FixedSizeList(c) => c.dtype(),
154            Canonical::Struct(c) => c.dtype(),
155            Canonical::Extension(c) => c.dtype(),
156        }
157    }
158
159    pub fn is_empty(&self) -> bool {
160        match self {
161            Canonical::Null(c) => c.is_empty(),
162            Canonical::Bool(c) => c.is_empty(),
163            Canonical::Primitive(c) => c.is_empty(),
164            Canonical::Decimal(c) => c.is_empty(),
165            Canonical::VarBinView(c) => c.is_empty(),
166            Canonical::List(c) => c.is_empty(),
167            Canonical::FixedSizeList(c) => c.is_empty(),
168            Canonical::Struct(c) => c.is_empty(),
169            Canonical::Extension(c) => c.is_empty(),
170        }
171    }
172}
173
174impl Canonical {
175    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
176    ///
177    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
178    /// they can accumulate wasted space after slicing and taking operations.
179    ///
180    /// This operation is very expensive and can result in things like allocations, full-scans
181    /// and copy operations.
182    pub fn compact(&self) -> VortexResult<Canonical> {
183        match self {
184            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
185            Canonical::List(array) => Ok(Canonical::List(
186                array.rebuild(ListViewRebuildMode::TrimElements)?,
187            )),
188            _ => Ok(self.clone()),
189        }
190    }
191}
192
193// Unwrap canonical type back down to specialized type.
194impl Canonical {
195    pub fn as_null(&self) -> &NullArray {
196        if let Canonical::Null(a) = self {
197            a
198        } else {
199            vortex_panic!("Cannot get NullArray from {:?}", &self)
200        }
201    }
202
203    pub fn into_null(self) -> NullArray {
204        if let Canonical::Null(a) = self {
205            a
206        } else {
207            vortex_panic!("Cannot unwrap NullArray from {:?}", &self)
208        }
209    }
210
211    pub fn as_bool(&self) -> &BoolArray {
212        if let Canonical::Bool(a) = self {
213            a
214        } else {
215            vortex_panic!("Cannot get BoolArray from {:?}", &self)
216        }
217    }
218
219    pub fn into_bool(self) -> BoolArray {
220        if let Canonical::Bool(a) = self {
221            a
222        } else {
223            vortex_panic!("Cannot unwrap BoolArray from {:?}", &self)
224        }
225    }
226
227    pub fn as_primitive(&self) -> &PrimitiveArray {
228        if let Canonical::Primitive(a) = self {
229            a
230        } else {
231            vortex_panic!("Cannot get PrimitiveArray from {:?}", &self)
232        }
233    }
234
235    pub fn into_primitive(self) -> PrimitiveArray {
236        if let Canonical::Primitive(a) = self {
237            a
238        } else {
239            vortex_panic!("Cannot unwrap PrimitiveArray from {:?}", &self)
240        }
241    }
242
243    pub fn as_decimal(&self) -> &DecimalArray {
244        if let Canonical::Decimal(a) = self {
245            a
246        } else {
247            vortex_panic!("Cannot get DecimalArray from {:?}", &self)
248        }
249    }
250
251    pub fn into_decimal(self) -> DecimalArray {
252        if let Canonical::Decimal(a) = self {
253            a
254        } else {
255            vortex_panic!("Cannot unwrap DecimalArray from {:?}", &self)
256        }
257    }
258
259    pub fn as_varbinview(&self) -> &VarBinViewArray {
260        if let Canonical::VarBinView(a) = self {
261            a
262        } else {
263            vortex_panic!("Cannot get VarBinViewArray from {:?}", &self)
264        }
265    }
266
267    pub fn into_varbinview(self) -> VarBinViewArray {
268        if let Canonical::VarBinView(a) = self {
269            a
270        } else {
271            vortex_panic!("Cannot unwrap VarBinViewArray from {:?}", &self)
272        }
273    }
274
275    pub fn as_listview(&self) -> &ListViewArray {
276        if let Canonical::List(a) = self {
277            a
278        } else {
279            vortex_panic!("Cannot get ListArray from {:?}", &self)
280        }
281    }
282
283    pub fn into_listview(self) -> ListViewArray {
284        if let Canonical::List(a) = self {
285            a
286        } else {
287            vortex_panic!("Cannot unwrap ListArray from {:?}", &self)
288        }
289    }
290
291    pub fn as_fixed_size_list(&self) -> &FixedSizeListArray {
292        if let Canonical::FixedSizeList(a) = self {
293            a
294        } else {
295            vortex_panic!("Cannot get FixedSizeListArray from {:?}", &self)
296        }
297    }
298
299    pub fn into_fixed_size_list(self) -> FixedSizeListArray {
300        if let Canonical::FixedSizeList(a) = self {
301            a
302        } else {
303            vortex_panic!("Cannot unwrap FixedSizeListArray from {:?}", &self)
304        }
305    }
306
307    pub fn as_struct(&self) -> &StructArray {
308        if let Canonical::Struct(a) = self {
309            a
310        } else {
311            vortex_panic!("Cannot get StructArray from {:?}", &self)
312        }
313    }
314
315    pub fn into_struct(self) -> StructArray {
316        if let Canonical::Struct(a) = self {
317            a
318        } else {
319            vortex_panic!("Cannot unwrap StructArray from {:?}", &self)
320        }
321    }
322
323    pub fn as_extension(&self) -> &ExtensionArray {
324        if let Canonical::Extension(a) = self {
325            a
326        } else {
327            vortex_panic!("Cannot get ExtensionArray from {:?}", &self)
328        }
329    }
330
331    pub fn into_extension(self) -> ExtensionArray {
332        if let Canonical::Extension(a) = self {
333            a
334        } else {
335            vortex_panic!("Cannot unwrap ExtensionArray from {:?}", &self)
336        }
337    }
338}
339
340impl AsRef<dyn Array> for Canonical {
341    fn as_ref(&self) -> &(dyn Array + 'static) {
342        match &self {
343            Canonical::Null(a) => a.as_ref(),
344            Canonical::Bool(a) => a.as_ref(),
345            Canonical::Primitive(a) => a.as_ref(),
346            Canonical::Decimal(a) => a.as_ref(),
347            Canonical::Struct(a) => a.as_ref(),
348            Canonical::List(a) => a.as_ref(),
349            Canonical::FixedSizeList(a) => a.as_ref(),
350            Canonical::VarBinView(a) => a.as_ref(),
351            Canonical::Extension(a) => a.as_ref(),
352        }
353    }
354}
355
356impl IntoArray for Canonical {
357    fn into_array(self) -> ArrayRef {
358        match self {
359            Canonical::Null(a) => a.into_array(),
360            Canonical::Bool(a) => a.into_array(),
361            Canonical::Primitive(a) => a.into_array(),
362            Canonical::Decimal(a) => a.into_array(),
363            Canonical::Struct(a) => a.into_array(),
364            Canonical::List(a) => a.into_array(),
365            Canonical::FixedSizeList(a) => a.into_array(),
366            Canonical::VarBinView(a) => a.into_array(),
367            Canonical::Extension(a) => a.into_array(),
368        }
369    }
370}
371
372/// Trait for types that can be converted from an owned type into an owned array variant.
373///
374/// # Canonicalization
375///
376/// This trait has a blanket implementation for all types implementing [ToCanonical].
377pub trait ToCanonical {
378    /// Canonicalize into a [`NullArray`] if the target is [`Null`](DType::Null) typed.
379    fn to_null(&self) -> NullArray;
380
381    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`](DType::Bool) typed.
382    fn to_bool(&self) -> BoolArray;
383
384    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`](DType::Primitive)
385    /// typed.
386    fn to_primitive(&self) -> PrimitiveArray;
387
388    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`](DType::Decimal)
389    /// typed.
390    fn to_decimal(&self) -> DecimalArray;
391
392    /// Canonicalize into a [`StructArray`] if the target is [`Struct`](DType::Struct) typed.
393    fn to_struct(&self) -> StructArray;
394
395    /// Canonicalize into a [`ListViewArray`] if the target is [`List`](DType::List) typed.
396    fn to_listview(&self) -> ListViewArray;
397
398    /// Canonicalize into a [`FixedSizeListArray`] if the target is [`List`](DType::FixedSizeList)
399    /// typed.
400    fn to_fixed_size_list(&self) -> FixedSizeListArray;
401
402    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`](DType::Utf8)
403    /// or [`Binary`](DType::Binary) typed.
404    fn to_varbinview(&self) -> VarBinViewArray;
405
406    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`](DType::Extension)
407    /// typed.
408    fn to_extension(&self) -> ExtensionArray;
409}
410
411// Blanket impl for all Array encodings.
412impl<A: Array + ?Sized> ToCanonical for A {
413    fn to_null(&self) -> NullArray {
414        self.to_canonical()
415            .vortex_expect("to_canonical failed")
416            .into_null()
417    }
418
419    fn to_bool(&self) -> BoolArray {
420        self.to_canonical()
421            .vortex_expect("to_canonical failed")
422            .into_bool()
423    }
424
425    fn to_primitive(&self) -> PrimitiveArray {
426        self.to_canonical()
427            .vortex_expect("to_canonical failed")
428            .into_primitive()
429    }
430
431    fn to_decimal(&self) -> DecimalArray {
432        self.to_canonical()
433            .vortex_expect("to_canonical failed")
434            .into_decimal()
435    }
436
437    fn to_struct(&self) -> StructArray {
438        self.to_canonical()
439            .vortex_expect("to_canonical failed")
440            .into_struct()
441    }
442
443    fn to_listview(&self) -> ListViewArray {
444        self.to_canonical()
445            .vortex_expect("to_canonical failed")
446            .into_listview()
447    }
448
449    fn to_fixed_size_list(&self) -> FixedSizeListArray {
450        self.to_canonical()
451            .vortex_expect("to_canonical failed")
452            .into_fixed_size_list()
453    }
454
455    fn to_varbinview(&self) -> VarBinViewArray {
456        self.to_canonical()
457            .vortex_expect("to_canonical failed")
458            .into_varbinview()
459    }
460
461    fn to_extension(&self) -> ExtensionArray {
462        self.to_canonical()
463            .vortex_expect("to_canonical failed")
464            .into_extension()
465    }
466}
467
468impl From<Canonical> for ArrayRef {
469    fn from(value: Canonical) -> Self {
470        match value {
471            Canonical::Null(a) => a.into_array(),
472            Canonical::Bool(a) => a.into_array(),
473            Canonical::Primitive(a) => a.into_array(),
474            Canonical::Decimal(a) => a.into_array(),
475            Canonical::Struct(a) => a.into_array(),
476            Canonical::List(a) => a.into_array(),
477            Canonical::FixedSizeList(a) => a.into_array(),
478            Canonical::VarBinView(a) => a.into_array(),
479            Canonical::Extension(a) => a.into_array(),
480        }
481    }
482}
483
484/// Recursively execute the array until it reaches canonical form.
485///
486/// Callers should prefer to execute into `Columnar` if they are able to optimize their use for
487/// constant arrays.
488impl Executable for Canonical {
489    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
490        if let Some(canonical) = array.as_opt::<AnyCanonical>() {
491            return Ok(canonical.into());
492        }
493
494        // Invoke execute directly to avoid logging the call in the execution context.
495        Ok(match Columnar::execute(array.clone(), ctx)? {
496            Columnar::Canonical(c) => c,
497            Columnar::Constant(s) => {
498                let canonical = constant_canonicalize(&s)?;
499                canonical
500                    .as_ref()
501                    .statistics()
502                    .inherit_from(array.statistics());
503                canonical
504            }
505        })
506    }
507}
508
509/// Recursively execute the array until it reaches canonical form along with its validity.
510///
511/// Callers should prefer to execute into `Columnar` instead of this specific target.
512/// This target is useful when preparing arrays for writing.
513pub struct CanonicalValidity(pub Canonical);
514
515impl Executable for CanonicalValidity {
516    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
517        match array.execute::<Canonical>(ctx)? {
518            n @ Canonical::Null(_) => Ok(CanonicalValidity(n)),
519            Canonical::Bool(b) => {
520                let BoolArrayParts {
521                    bits,
522                    offset,
523                    len,
524                    validity,
525                } = b.into_parts();
526                Ok(CanonicalValidity(Canonical::Bool(
527                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
528                )))
529            }
530            Canonical::Primitive(p) => {
531                let PrimitiveArrayParts {
532                    ptype,
533                    buffer,
534                    validity,
535                } = p.into_parts();
536                Ok(CanonicalValidity(Canonical::Primitive(unsafe {
537                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
538                })))
539            }
540            Canonical::Decimal(d) => {
541                let DecimalArrayParts {
542                    decimal_dtype,
543                    values,
544                    values_type,
545                    validity,
546                } = d.into_parts();
547                Ok(CanonicalValidity(Canonical::Decimal(unsafe {
548                    DecimalArray::new_unchecked_handle(
549                        values,
550                        values_type,
551                        decimal_dtype,
552                        validity.execute(ctx)?,
553                    )
554                })))
555            }
556            Canonical::VarBinView(vbv) => {
557                let VarBinViewArrayParts {
558                    dtype,
559                    buffers,
560                    views,
561                    validity,
562                } = vbv.into_parts();
563                Ok(CanonicalValidity(Canonical::VarBinView(unsafe {
564                    VarBinViewArray::new_handle_unchecked(
565                        views,
566                        buffers,
567                        dtype,
568                        validity.execute(ctx)?,
569                    )
570                })))
571            }
572            Canonical::List(l) => {
573                let ListViewArrayParts {
574                    elements,
575                    offsets,
576                    sizes,
577                    validity,
578                    ..
579                } = l.into_parts();
580                Ok(CanonicalValidity(Canonical::List(unsafe {
581                    ListViewArray::new_unchecked(elements, offsets, sizes, validity.execute(ctx)?)
582                })))
583            }
584            Canonical::FixedSizeList(fsl) => {
585                let list_size = fsl.list_size();
586                let len = fsl.len();
587                let (elements, validity, _) = fsl.into_parts();
588                Ok(CanonicalValidity(Canonical::FixedSizeList(
589                    FixedSizeListArray::new(elements, list_size, validity.execute(ctx)?, len),
590                )))
591            }
592            Canonical::Struct(st) => {
593                let len = st.len();
594                let StructArrayParts {
595                    struct_fields,
596                    fields,
597                    validity,
598                } = st.into_parts();
599                Ok(CanonicalValidity(Canonical::Struct(unsafe {
600                    StructArray::new_unchecked(fields, struct_fields, len, validity.execute(ctx)?)
601                })))
602            }
603            Canonical::Extension(ext) => Ok(CanonicalValidity(Canonical::Extension(
604                ExtensionArray::new(
605                    ext.ext_dtype().clone(),
606                    ext.storage()
607                        .clone()
608                        .execute::<CanonicalValidity>(ctx)?
609                        .0
610                        .into_array(),
611                ),
612            ))),
613        }
614    }
615}
616
617/// Recursively execute the array until all of its children are canonical.
618///
619/// This method is useful to guarantee that all operators are fully executed,
620/// callers should prefer an execution target that's suitable for their use case instead of this one.
621pub struct RecursiveCanonical(pub Canonical);
622
623impl Executable for RecursiveCanonical {
624    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
625        match array.execute::<Canonical>(ctx)? {
626            n @ Canonical::Null(_) => Ok(RecursiveCanonical(n)),
627            Canonical::Bool(b) => {
628                let BoolArrayParts {
629                    bits,
630                    offset,
631                    len,
632                    validity,
633                } = b.into_parts();
634                Ok(RecursiveCanonical(Canonical::Bool(
635                    BoolArray::try_new_from_handle(bits, offset, len, validity.execute(ctx)?)?,
636                )))
637            }
638            Canonical::Primitive(p) => {
639                let PrimitiveArrayParts {
640                    ptype,
641                    buffer,
642                    validity,
643                } = p.into_parts();
644                Ok(RecursiveCanonical(Canonical::Primitive(unsafe {
645                    PrimitiveArray::new_unchecked_from_handle(buffer, ptype, validity.execute(ctx)?)
646                })))
647            }
648            Canonical::Decimal(d) => {
649                let DecimalArrayParts {
650                    decimal_dtype,
651                    values,
652                    values_type,
653                    validity,
654                } = d.into_parts();
655                Ok(RecursiveCanonical(Canonical::Decimal(unsafe {
656                    DecimalArray::new_unchecked_handle(
657                        values,
658                        values_type,
659                        decimal_dtype,
660                        validity.execute(ctx)?,
661                    )
662                })))
663            }
664            Canonical::VarBinView(vbv) => {
665                let VarBinViewArrayParts {
666                    dtype,
667                    buffers,
668                    views,
669                    validity,
670                } = vbv.into_parts();
671                Ok(RecursiveCanonical(Canonical::VarBinView(unsafe {
672                    VarBinViewArray::new_handle_unchecked(
673                        views,
674                        buffers,
675                        dtype,
676                        validity.execute(ctx)?,
677                    )
678                })))
679            }
680            Canonical::List(l) => {
681                let ListViewArrayParts {
682                    elements,
683                    offsets,
684                    sizes,
685                    validity,
686                    ..
687                } = l.into_parts();
688                Ok(RecursiveCanonical(Canonical::List(unsafe {
689                    ListViewArray::new_unchecked(
690                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
691                        offsets.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
692                        sizes.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
693                        validity.execute(ctx)?,
694                    )
695                })))
696            }
697            Canonical::FixedSizeList(fsl) => {
698                let list_size = fsl.list_size();
699                let len = fsl.len();
700                let (elements, validity, _) = fsl.into_parts();
701                Ok(RecursiveCanonical(Canonical::FixedSizeList(
702                    FixedSizeListArray::new(
703                        elements.execute::<RecursiveCanonical>(ctx)?.0.into_array(),
704                        list_size,
705                        validity.execute(ctx)?,
706                        len,
707                    ),
708                )))
709            }
710            Canonical::Struct(st) => {
711                let len = st.len();
712                let StructArrayParts {
713                    struct_fields,
714                    fields,
715                    validity,
716                } = st.into_parts();
717                let executed_fields = fields
718                    .iter()
719                    .map(|f| Ok(f.clone().execute::<RecursiveCanonical>(ctx)?.0.into_array()))
720                    .collect::<VortexResult<Arc<[_]>>>()?;
721
722                Ok(RecursiveCanonical(Canonical::Struct(unsafe {
723                    StructArray::new_unchecked(
724                        executed_fields,
725                        struct_fields,
726                        len,
727                        validity.execute(ctx)?,
728                    )
729                })))
730            }
731            Canonical::Extension(ext) => Ok(RecursiveCanonical(Canonical::Extension(
732                ExtensionArray::new(
733                    ext.ext_dtype().clone(),
734                    ext.storage()
735                        .clone()
736                        .execute::<RecursiveCanonical>(ctx)?
737                        .0
738                        .into_array(),
739                ),
740            ))),
741        }
742    }
743}
744
745/// Execute a primitive typed array into a buffer of native values, assuming all values are valid.
746///
747/// # Errors
748///
749/// Returns a `VortexError` if the array is not all-valid (has any nulls).
750impl<T: NativePType> Executable for Buffer<T> {
751    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
752        let array = PrimitiveArray::execute(array, ctx)?;
753        vortex_ensure!(
754            array.all_valid()?,
755            "Cannot execute to native buffer: array is not all-valid."
756        );
757        Ok(array.into_buffer())
758    }
759}
760
761/// Execute the array to canonical form and unwrap as a [`PrimitiveArray`].
762///
763/// This will panic if the array's dtype is not primitive.
764impl Executable for PrimitiveArray {
765    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
766        match array.try_into::<PrimitiveVTable>() {
767            Ok(primitive) => Ok(primitive),
768            Err(array) => Ok(Canonical::execute(array, ctx)?.into_primitive()),
769        }
770    }
771}
772
773/// Execute the array to canonical form and unwrap as a [`BoolArray`].
774///
775/// This will panic if the array's dtype is not bool.
776impl Executable for BoolArray {
777    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
778        match array.try_into::<BoolVTable>() {
779            Ok(bool_array) => Ok(bool_array),
780            Err(array) => Ok(Canonical::execute(array, ctx)?.into_bool()),
781        }
782    }
783}
784
785/// Execute the array to canonical form and unwrap as a [`NullArray`].
786///
787/// This will panic if the array's dtype is not null.
788impl Executable for NullArray {
789    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
790        match array.try_into::<NullVTable>() {
791            Ok(null_array) => Ok(null_array),
792            Err(array) => Ok(Canonical::execute(array, ctx)?.into_null()),
793        }
794    }
795}
796
797/// Execute the array to canonical form and unwrap as a [`VarBinViewArray`].
798///
799/// This will panic if the array's dtype is not utf8 or binary.
800impl Executable for VarBinViewArray {
801    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
802        match array.try_into::<VarBinViewVTable>() {
803            Ok(varbinview) => Ok(varbinview),
804            Err(array) => Ok(Canonical::execute(array, ctx)?.into_varbinview()),
805        }
806    }
807}
808
809/// Execute the array to canonical form and unwrap as an [`ExtensionArray`].
810///
811/// This will panic if the array's dtype is not an extension type.
812impl Executable for ExtensionArray {
813    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
814        match array.try_into::<ExtensionVTable>() {
815            Ok(ext_array) => Ok(ext_array),
816            Err(array) => Ok(Canonical::execute(array, ctx)?.into_extension()),
817        }
818    }
819}
820
821/// Execute the array to canonical form and unwrap as a [`DecimalArray`].
822///
823/// This will panic if the array's dtype is not decimal.
824impl Executable for DecimalArray {
825    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
826        match array.try_into::<DecimalVTable>() {
827            Ok(decimal) => Ok(decimal),
828            Err(array) => Ok(Canonical::execute(array, ctx)?.into_decimal()),
829        }
830    }
831}
832
833/// Execute the array to canonical form and unwrap as a [`ListViewArray`].
834///
835/// This will panic if the array's dtype is not list.
836impl Executable for ListViewArray {
837    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
838        match array.try_into::<ListViewVTable>() {
839            Ok(list) => Ok(list),
840            Err(array) => Ok(Canonical::execute(array, ctx)?.into_listview()),
841        }
842    }
843}
844
845/// Execute the array to canonical form and unwrap as a [`FixedSizeListArray`].
846///
847/// This will panic if the array's dtype is not fixed size list.
848impl Executable for FixedSizeListArray {
849    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
850        match array.try_into::<FixedSizeListVTable>() {
851            Ok(fsl) => Ok(fsl),
852            Err(array) => Ok(Canonical::execute(array, ctx)?.into_fixed_size_list()),
853        }
854    }
855}
856
857/// Execute the array to canonical form and unwrap as a [`StructArray`].
858///
859/// This will panic if the array's dtype is not struct.
860impl Executable for StructArray {
861    fn execute(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
862        match array.try_into::<StructVTable>() {
863            Ok(struct_array) => Ok(struct_array),
864            Err(array) => Ok(Canonical::execute(array, ctx)?.into_struct()),
865        }
866    }
867}
868
869/// A view into a canonical array type.
870#[derive(Debug, Clone)]
871pub enum CanonicalView<'a> {
872    Null(&'a NullArray),
873    Bool(&'a BoolArray),
874    Primitive(&'a PrimitiveArray),
875    Decimal(&'a DecimalArray),
876    VarBinView(&'a VarBinViewArray),
877    List(&'a ListViewArray),
878    FixedSizeList(&'a FixedSizeListArray),
879    Struct(&'a StructArray),
880    Extension(&'a ExtensionArray),
881}
882
883impl From<CanonicalView<'_>> for Canonical {
884    fn from(value: CanonicalView<'_>) -> Self {
885        match value {
886            CanonicalView::Null(a) => Canonical::Null(a.clone()),
887            CanonicalView::Bool(a) => Canonical::Bool(a.clone()),
888            CanonicalView::Primitive(a) => Canonical::Primitive(a.clone()),
889            CanonicalView::Decimal(a) => Canonical::Decimal(a.clone()),
890            CanonicalView::VarBinView(a) => Canonical::VarBinView(a.clone()),
891            CanonicalView::List(a) => Canonical::List(a.clone()),
892            CanonicalView::FixedSizeList(a) => Canonical::FixedSizeList(a.clone()),
893            CanonicalView::Struct(a) => Canonical::Struct(a.clone()),
894            CanonicalView::Extension(a) => Canonical::Extension(a.clone()),
895        }
896    }
897}
898
899impl AsRef<dyn Array> for CanonicalView<'_> {
900    fn as_ref(&self) -> &dyn Array {
901        match self {
902            CanonicalView::Null(a) => a.as_ref(),
903            CanonicalView::Bool(a) => a.as_ref(),
904            CanonicalView::Primitive(a) => a.as_ref(),
905            CanonicalView::Decimal(a) => a.as_ref(),
906            CanonicalView::VarBinView(a) => a.as_ref(),
907            CanonicalView::List(a) => a.as_ref(),
908            CanonicalView::FixedSizeList(a) => a.as_ref(),
909            CanonicalView::Struct(a) => a.as_ref(),
910            CanonicalView::Extension(a) => a.as_ref(),
911        }
912    }
913}
914
915/// A matcher for any canonical array type.
916pub struct AnyCanonical;
917impl Matcher for AnyCanonical {
918    type Match<'a> = CanonicalView<'a>;
919
920    fn matches(array: &dyn Array) -> bool {
921        array.is::<NullVTable>()
922            || array.is::<BoolVTable>()
923            || array.is::<PrimitiveVTable>()
924            || array.is::<DecimalVTable>()
925            || array.is::<StructVTable>()
926            || array.is::<ListViewVTable>()
927            || array.is::<FixedSizeListVTable>()
928            || array.is::<VarBinViewVTable>()
929            || array.is::<ExtensionVTable>()
930    }
931
932    fn try_match<'a>(array: &'a dyn Array) -> Option<Self::Match<'a>> {
933        if let Some(a) = array.as_opt::<NullVTable>() {
934            Some(CanonicalView::Null(a))
935        } else if let Some(a) = array.as_opt::<BoolVTable>() {
936            Some(CanonicalView::Bool(a))
937        } else if let Some(a) = array.as_opt::<PrimitiveVTable>() {
938            Some(CanonicalView::Primitive(a))
939        } else if let Some(a) = array.as_opt::<DecimalVTable>() {
940            Some(CanonicalView::Decimal(a))
941        } else if let Some(a) = array.as_opt::<StructVTable>() {
942            Some(CanonicalView::Struct(a))
943        } else if let Some(a) = array.as_opt::<ListViewVTable>() {
944            Some(CanonicalView::List(a))
945        } else if let Some(a) = array.as_opt::<FixedSizeListVTable>() {
946            Some(CanonicalView::FixedSizeList(a))
947        } else if let Some(a) = array.as_opt::<VarBinViewVTable>() {
948            Some(CanonicalView::VarBinView(a))
949        } else {
950            array
951                .as_opt::<ExtensionVTable>()
952                .map(CanonicalView::Extension)
953        }
954    }
955}
956
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use arrow_array::Array as ArrowArray;
    use arrow_array::ArrayRef as ArrowArrayRef;
    use arrow_array::ListArray as ArrowListArray;
    use arrow_array::PrimitiveArray as ArrowPrimitiveArray;
    use arrow_array::StringArray;
    use arrow_array::StringViewArray;
    use arrow_array::StructArray as ArrowStructArray;
    use arrow_array::cast::AsArray;
    use arrow_array::types::Int32Type;
    use arrow_array::types::Int64Type;
    use arrow_array::types::UInt64Type;
    use arrow_buffer::NullBufferBuilder;
    use arrow_buffer::OffsetBuffer;
    use arrow_schema::DataType;
    use arrow_schema::Field;
    use vortex_buffer::buffer;

    use crate::ArrayRef;
    use crate::IntoArray;
    use crate::arrays::ConstantArray;
    use crate::arrays::StructArray;
    use crate::arrow::FromArrowArray;
    use crate::arrow::IntoArrowArray;

    // Converting a struct to Arrow must also canonicalize non-canonical children that
    // are nested inside an inner struct.
    #[test]
    fn test_canonicalize_nested_struct() {
        // The nested struct contains a ConstantArray representing the primitive array
        //   [100i64]
        // ConstantArray is not a canonical type, so converting `into_arrow()` should
        // map this to the nearest canonical type (PrimitiveArray).
        let inner = StructArray::from_fields(&[(
            "inner_a",
            ConstantArray::new(100i64, 1).into_array(),
        )])
        .unwrap();

        // Create a struct array with multiple internal components.
        let nested_struct_array = StructArray::from_fields(&[
            ("a", buffer![1u64].into_array()),
            ("b", inner.into_array()),
        ])
        .unwrap();

        // Keep the converted array alive in a binding so the downcasts below can borrow
        // from it instead of cloning.
        let arrow_array = nested_struct_array
            .into_array()
            .into_arrow_preferred()
            .unwrap();
        let arrow_struct = arrow_array
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .unwrap();

        assert!(
            arrow_struct
                .column(0)
                .as_any()
                .is::<ArrowPrimitiveArray<UInt64Type>>()
        );

        let inner_struct = arrow_struct
            .column(1)
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .unwrap();

        let inner_a = inner_struct
            .column(0)
            .as_any()
            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
        assert!(inner_a.is_some());

        let expected = ArrowPrimitiveArray::<Int64Type>::from_iter([100i64]);
        assert_eq!(inner_a.unwrap(), &expected);
    }

    // An Arrow struct with nullable fields and struct-level nulls must survive a trip
    // through Vortex and back unchanged.
    #[test]
    fn roundtrip_struct() {
        // Struct-level validity: rows 0..=3 valid, row 4 null, row 5 valid.
        let mut nulls = NullBufferBuilder::new(6);
        nulls.append_n_non_nulls(4);
        nulls.append_null();
        nulls.append_non_null();

        let names: ArrowArrayRef = Arc::new(StringViewArray::from_iter(vec![
            Some("Joseph"),
            None,
            Some("Angela"),
            Some("Mikhail"),
            None,
            None,
        ]));
        let ages: ArrowArrayRef = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
            Some(25),
            Some(31),
            None,
            Some(57),
            None,
            None,
        ]));

        let fields = vec![
            Arc::new(Field::new("name", DataType::Utf8View, true)),
            Arc::new(Field::new("age", DataType::Int32, true)),
        ];
        let arrow_struct = ArrowStructArray::new(fields.into(), vec![names, ages], nulls.finish());

        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true).unwrap();
        let roundtripped = vortex_struct.into_arrow_preferred().unwrap();

        assert_eq!(&arrow_struct, roundtripped.as_struct());
    }

    // An Arrow list of strings must survive a trip through Vortex and back when the
    // original Arrow data type is requested explicitly.
    #[test]
    fn roundtrip_list() {
        let names = Arc::new(StringArray::from_iter(vec![
            Some("Joseph"),
            Some("Angela"),
            Some("Mikhail"),
        ]));

        let arrow_list = ArrowListArray::new(
            Arc::new(Field::new_list_field(DataType::Utf8, true)),
            OffsetBuffer::from_lengths(vec![0, 2, 1]),
            names,
            None,
        );

        let vortex_list = ArrayRef::from_arrow(&arrow_list, true).unwrap();
        let rt_arrow_list = vortex_list.into_arrow(arrow_list.data_type()).unwrap();

        // Compare as trait objects; borrowing the original avoids cloning it into an Arc.
        let expected: &dyn ArrowArray = &arrow_list;
        assert_eq!(expected, rt_arrow_list.as_ref());
    }
}