vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use vortex_dtype::DType;
7use vortex_error::{VortexResult, vortex_panic};
8
9use crate::arrays::{
10    BoolArray, DecimalArray, ExtensionArray, FixedSizeListArray, ListArray, NullArray,
11    PrimitiveArray, StructArray, VarBinViewArray,
12};
13use crate::builders::builder_with_capacity;
14use crate::{Array, ArrayRef, IntoArray};
15
16/// An enum capturing the default uncompressed encodings for each [Vortex type][DType].
17///
18/// Any array can be decoded into canonical form via the [`to_canonical`][Array::to_canonical]
19/// trait method. This is the simplest encoding for a type, and will not be compressed but may
20/// contain compressed child arrays.
21///
22/// Canonical form is useful for doing type-specific compute where you need to know that all
23/// elements are laid out decompressed and contiguous in memory.
24///
25/// # Laziness
26///
27/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
28/// `Struct` type, individual column child arrays may still be compressed. This allows
29/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
30/// never need to be decoded into canonical form at all depending on the compute.
31///
32/// # Arrow interoperability
33///
34/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
35/// zero-copy, and the corresponding Arrow array types can also be built directly.
36///
37/// The full list of canonical types and their equivalent Arrow array types are:
38///
39/// * `NullArray`: [`arrow_array::NullArray`]
40/// * `BoolArray`: [`arrow_array::BooleanArray`]
41/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
42/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
43/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
44/// * `ListArray`: [`arrow_array::ListArray`]
45/// * `FixedSizeListArray`: [`arrow_array::FixedSizeListArray`]
46/// * `StructArray`: [`arrow_array::StructArray`]
47///
48/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
49/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
50/// create ambiguity.
51/// Thus, if you receive an Arrow array, compress it using Vortex, and then
52/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
53/// variants to hold the data.
54///
55/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
56/// will correspond to an arrow-rs [`arrow_schema::DataType`].
57///
58/// # Views support
59///
60/// Binary and String views, also known as "German strings" are a better encoding format for
61/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
62/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
63/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
64/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
65/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
66/// from memory.
67///
68/// # For Developers
69///
70/// If you add another variant to this enum, make sure to update [`Array::is_canonical`],
71/// [`ArrayRegistry::canonical_only`], and the fuzzer in `fuzz/fuzz_targets/array_ops.rs`.
72///
73/// [`ArrayRegistry::canonical_only`]: crate::ArrayRegistry::canonical_only
74#[derive(Debug, Clone)]
75pub enum Canonical {
76    Null(NullArray),
77    Bool(BoolArray),
78    Primitive(PrimitiveArray),
79    Decimal(DecimalArray),
80    VarBinView(VarBinViewArray),
81    // TODO(connor)[ListView]: Convert the canonical encoding of `List` to `ListViewArray`.
82    List(ListArray),
83    FixedSizeList(FixedSizeListArray),
84    Struct(StructArray),
85    Extension(ExtensionArray),
86}
87
88impl Canonical {
89    /// Create an empty canonical array of the given dtype.
90    pub fn empty(dtype: &DType) -> Canonical {
91        builder_with_capacity(dtype, 0).finish_into_canonical()
92    }
93}
94
95impl Canonical {
96    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
97    ///
98    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
99    /// they can accumulate wasted space after slicing and taking operations.
100    ///
101    /// This operation is very expensive and can result in things like allocations, full-scans
102    /// and copy operations.
103    pub fn compact(&self) -> VortexResult<Canonical> {
104        match self {
105            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
106            Canonical::List(array) => Ok(Canonical::List(array.reset_offsets()?)),
107            _ => Ok(self.clone()),
108        }
109    }
110}
111
112// Unwrap canonical type back down to specialized type.
113impl Canonical {
114    pub fn into_null(self) -> NullArray {
115        if let Canonical::Null(a) = self {
116            a
117        } else {
118            vortex_panic!("Cannot unwrap NullArray from {:?}", &self)
119        }
120    }
121
122    pub fn into_bool(self) -> BoolArray {
123        if let Canonical::Bool(a) = self {
124            a
125        } else {
126            vortex_panic!("Cannot unwrap BoolArray from {:?}", &self)
127        }
128    }
129
130    pub fn into_primitive(self) -> PrimitiveArray {
131        if let Canonical::Primitive(a) = self {
132            a
133        } else {
134            vortex_panic!("Cannot unwrap PrimitiveArray from {:?}", &self)
135        }
136    }
137
138    pub fn into_decimal(self) -> DecimalArray {
139        if let Canonical::Decimal(a) = self {
140            a
141        } else {
142            vortex_panic!("Cannot unwrap DecimalArray from {:?}", &self)
143        }
144    }
145
146    pub fn into_varbinview(self) -> VarBinViewArray {
147        if let Canonical::VarBinView(a) = self {
148            a
149        } else {
150            vortex_panic!("Cannot unwrap VarBinViewArray from {:?}", &self)
151        }
152    }
153
154    pub fn into_list(self) -> ListArray {
155        if let Canonical::List(a) = self {
156            a
157        } else {
158            vortex_panic!("Cannot unwrap ListArray from {:?}", &self)
159        }
160    }
161
162    pub fn into_fixed_size_list(self) -> FixedSizeListArray {
163        if let Canonical::FixedSizeList(a) = self {
164            a
165        } else {
166            vortex_panic!("Cannot unwrap FixedSizeListArray from {:?}", &self)
167        }
168    }
169
170    pub fn into_struct(self) -> StructArray {
171        if let Canonical::Struct(a) = self {
172            a
173        } else {
174            vortex_panic!("Cannot unwrap StructArray from {:?}", &self)
175        }
176    }
177
178    pub fn into_extension(self) -> ExtensionArray {
179        if let Canonical::Extension(a) = self {
180            a
181        } else {
182            vortex_panic!("Cannot unwrap ExtensionArray from {:?}", &self)
183        }
184    }
185}
186
187impl AsRef<dyn Array> for Canonical {
188    fn as_ref(&self) -> &(dyn Array + 'static) {
189        match &self {
190            Canonical::Null(a) => a.as_ref(),
191            Canonical::Bool(a) => a.as_ref(),
192            Canonical::Primitive(a) => a.as_ref(),
193            Canonical::Decimal(a) => a.as_ref(),
194            Canonical::Struct(a) => a.as_ref(),
195            Canonical::List(a) => a.as_ref(),
196            Canonical::FixedSizeList(a) => a.as_ref(),
197            Canonical::VarBinView(a) => a.as_ref(),
198            Canonical::Extension(a) => a.as_ref(),
199        }
200    }
201}
202
203impl IntoArray for Canonical {
204    fn into_array(self) -> ArrayRef {
205        match self {
206            Canonical::Null(a) => a.into_array(),
207            Canonical::Bool(a) => a.into_array(),
208            Canonical::Primitive(a) => a.into_array(),
209            Canonical::Decimal(a) => a.into_array(),
210            Canonical::Struct(a) => a.into_array(),
211            Canonical::List(a) => a.into_array(),
212            Canonical::FixedSizeList(a) => a.into_array(),
213            Canonical::VarBinView(a) => a.into_array(),
214            Canonical::Extension(a) => a.into_array(),
215        }
216    }
217}
218
219/// Trait for types that can be converted from an owned type into an owned array variant.
220///
221/// # Canonicalization
222///
223/// This trait has a blanket implementation for all types implementing [ToCanonical].
224pub trait ToCanonical {
225    /// Canonicalize into a [`NullArray`] if the target is [`Null`][DType::Null] typed.
226    fn to_null(&self) -> NullArray;
227
228    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`][DType::Bool] typed.
229    fn to_bool(&self) -> BoolArray;
230
231    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`][DType::Primitive]
232    /// typed.
233    fn to_primitive(&self) -> PrimitiveArray;
234
235    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`][DType::Decimal]
236    /// typed.
237    fn to_decimal(&self) -> DecimalArray;
238
239    /// Canonicalize into a [`StructArray`] if the target is [`Struct`][DType::Struct] typed.
240    fn to_struct(&self) -> StructArray;
241
242    /// Canonicalize into a [`ListArray`] if the target is [`List`][DType::List] typed.
243    fn to_list(&self) -> ListArray;
244
245    /// Canonicalize into a [`ListArray`] if the target is [`List`][DType::List] typed.
246    fn to_fixed_size_list(&self) -> FixedSizeListArray;
247
248    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`][DType::Utf8]
249    /// or [`Binary`][DType::Binary] typed.
250    fn to_varbinview(&self) -> VarBinViewArray;
251
252    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`][DType::Extension]
253    /// typed.
254    fn to_extension(&self) -> ExtensionArray;
255}
256
257// Blanket impl for all Array encodings.
258impl<A: Array + ?Sized> ToCanonical for A {
259    fn to_null(&self) -> NullArray {
260        self.to_canonical().into_null()
261    }
262
263    fn to_bool(&self) -> BoolArray {
264        self.to_canonical().into_bool()
265    }
266
267    fn to_primitive(&self) -> PrimitiveArray {
268        self.to_canonical().into_primitive()
269    }
270
271    fn to_decimal(&self) -> DecimalArray {
272        self.to_canonical().into_decimal()
273    }
274
275    fn to_struct(&self) -> StructArray {
276        self.to_canonical().into_struct()
277    }
278
279    fn to_list(&self) -> ListArray {
280        self.to_canonical().into_list()
281    }
282
283    fn to_fixed_size_list(&self) -> FixedSizeListArray {
284        self.to_canonical().into_fixed_size_list()
285    }
286
287    fn to_varbinview(&self) -> VarBinViewArray {
288        self.to_canonical().into_varbinview()
289    }
290
291    fn to_extension(&self) -> ExtensionArray {
292        self.to_canonical().into_extension()
293    }
294}
295
296impl From<Canonical> for ArrayRef {
297    fn from(value: Canonical) -> Self {
298        match value {
299            Canonical::Null(a) => a.into_array(),
300            Canonical::Bool(a) => a.into_array(),
301            Canonical::Primitive(a) => a.into_array(),
302            Canonical::Decimal(a) => a.into_array(),
303            Canonical::Struct(a) => a.into_array(),
304            Canonical::List(a) => a.into_array(),
305            Canonical::FixedSizeList(a) => a.into_array(),
306            Canonical::VarBinView(a) => a.into_array(),
307            Canonical::Extension(a) => a.into_array(),
308        }
309    }
310}
311
312#[cfg(test)]
313mod test {
314    use std::sync::Arc;
315
316    use arrow_array::cast::AsArray;
317    use arrow_array::types::{Int32Type, Int64Type, UInt64Type};
318    use arrow_array::{
319        Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
320        PrimitiveArray as ArrowPrimitiveArray, StringArray, StringViewArray,
321        StructArray as ArrowStructArray,
322    };
323    use arrow_buffer::{NullBufferBuilder, OffsetBuffer};
324    use arrow_schema::{DataType, Field};
325    use vortex_buffer::buffer;
326
327    use crate::arrays::{ConstantArray, StructArray};
328    use crate::arrow::{FromArrowArray, IntoArrowArray};
329    use crate::{ArrayRef, IntoArray};
330
331    #[test]
332    fn test_canonicalize_nested_struct() {
333        // Create a struct array with multiple internal components.
334        let nested_struct_array = StructArray::from_fields(&[
335            ("a", buffer![1u64].into_array()),
336            (
337                "b",
338                StructArray::from_fields(&[(
339                    "inner_a",
340                    // The nested struct contains a ConstantArray representing the primitive array
341                    //   [100i64]
342                    // ConstantArray is not a canonical type, so converting `into_arrow()` should
343                    // map this to the nearest canonical type (PrimitiveArray).
344                    ConstantArray::new(100i64, 1).into_array(),
345                )])
346                .unwrap()
347                .into_array(),
348            ),
349        ])
350        .unwrap();
351
352        let arrow_struct = nested_struct_array
353            .into_array()
354            .into_arrow_preferred()
355            .unwrap()
356            .as_any()
357            .downcast_ref::<ArrowStructArray>()
358            .cloned()
359            .unwrap();
360
361        assert!(
362            arrow_struct
363                .column(0)
364                .as_any()
365                .downcast_ref::<ArrowPrimitiveArray<UInt64Type>>()
366                .is_some()
367        );
368
369        let inner_struct = arrow_struct
370            .column(1)
371            .clone()
372            .as_any()
373            .downcast_ref::<ArrowStructArray>()
374            .cloned()
375            .unwrap();
376
377        let inner_a = inner_struct
378            .column(0)
379            .as_any()
380            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
381        assert!(inner_a.is_some());
382
383        assert_eq!(
384            inner_a.cloned().unwrap(),
385            ArrowPrimitiveArray::from_iter([100i64])
386        );
387    }
388
389    #[test]
390    fn roundtrip_struct() {
391        let mut nulls = NullBufferBuilder::new(6);
392        nulls.append_n_non_nulls(4);
393        nulls.append_null();
394        nulls.append_non_null();
395        let names = Arc::new(StringViewArray::from_iter(vec![
396            Some("Joseph"),
397            None,
398            Some("Angela"),
399            Some("Mikhail"),
400            None,
401            None,
402        ]));
403        let ages = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
404            Some(25),
405            Some(31),
406            None,
407            Some(57),
408            None,
409            None,
410        ]));
411
412        let arrow_struct = ArrowStructArray::new(
413            vec![
414                Arc::new(Field::new("name", DataType::Utf8View, true)),
415                Arc::new(Field::new("age", DataType::Int32, true)),
416            ]
417            .into(),
418            vec![names, ages],
419            nulls.finish(),
420        );
421
422        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true);
423
424        assert_eq!(
425            &arrow_struct,
426            vortex_struct.into_arrow_preferred().unwrap().as_struct()
427        );
428    }
429
430    #[test]
431    fn roundtrip_list() {
432        let names = Arc::new(StringArray::from_iter(vec![
433            Some("Joseph"),
434            Some("Angela"),
435            Some("Mikhail"),
436        ]));
437
438        let arrow_list = ArrowListArray::new(
439            Arc::new(Field::new_list_field(DataType::Utf8, true)),
440            OffsetBuffer::from_lengths(vec![0, 2, 1]),
441            names,
442            None,
443        );
444        let list_data_type = arrow_list.data_type();
445
446        let vortex_list = ArrayRef::from_arrow(&arrow_list, true);
447
448        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();
449
450        assert_eq!(
451            (Arc::new(arrow_list.clone()) as ArrowArrayRef).as_ref(),
452            rt_arrow_list.as_ref()
453        );
454    }
455}