vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use vortex_dtype::DType;
7use vortex_error::{VortexExpect, VortexResult, vortex_bail};
8
9use crate::arrays::{
10    BoolArray, DecimalArray, ExtensionArray, FixedSizeListArray, ListArray, NullArray,
11    PrimitiveArray, StructArray, VarBinViewArray,
12};
13use crate::builders::builder_with_capacity;
14use crate::{Array, ArrayRef, IntoArray};
15
16/// An enum capturing the default uncompressed encodings for each [Vortex type][DType].
17///
18/// Any array can be decoded into canonical form via the [`to_canonical`][Array::to_canonical]
19/// trait method. This is the simplest encoding for a type, and will not be compressed but may
20/// contain compressed child arrays.
21///
22/// Canonical form is useful for doing type-specific compute where you need to know that all
23/// elements are laid out decompressed and contiguous in memory.
24///
25/// # Laziness
26///
27/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
28/// `Struct` type, individual column child arrays may still be compressed. This allows
29/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
30/// never need to be decoded into canonical form at all depending on the compute.
31///
32/// # Arrow interoperability
33///
34/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
35/// zero-copy, and the corresponding Arrow array types can also be built directly.
36///
37/// The full list of canonical types and their equivalent Arrow array types are:
38///
39/// * `NullArray`: [`arrow_array::NullArray`]
40/// * `BoolArray`: [`arrow_array::BooleanArray`]
41/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
42/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
43/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
44/// * `ListArray`: [`arrow_array::ListArray`]
45/// * `FixedSizeListArray`: [`arrow_array::FixedSizeListArray`]
46/// * `StructArray`: [`arrow_array::StructArray`]
47///
48/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
49/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
50/// create ambiguity.
51/// Thus, if you receive an Arrow array, compress it using Vortex, and then
52/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
53/// variants to hold the data.
54///
55/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
56/// will correspond to an arrow-rs [`arrow_schema::DataType`].
57///
58/// # Views support
59///
60/// Binary and String views, also known as "German strings" are a better encoding format for
61/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
62/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
63/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
64/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
65/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
66/// from memory.
67#[derive(Debug, Clone)]
68pub enum Canonical {
69    Null(NullArray),
70    Bool(BoolArray),
71    Primitive(PrimitiveArray),
72    Decimal(DecimalArray),
73    VarBinView(VarBinViewArray),
74    // TODO(joe): maybe this should be a ListView, however this will be annoying in spiral
75    List(ListArray),
76    FixedSizeList(FixedSizeListArray),
77    Struct(StructArray),
78    Extension(ExtensionArray),
79}
80
81impl Canonical {
82    /// Create an empty canonical array of the given dtype.
83    pub fn empty(dtype: &DType) -> Canonical {
84        builder_with_capacity(dtype, 0)
85            .finish()
86            .to_canonical()
87            .vortex_expect("cannot fail to convert an empty array to canonical")
88    }
89}
90
91impl Canonical {
92    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
93    ///
94    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
95    /// they can accumulate wasted space after slicing and taking operations.
96    ///
97    /// This operation is very expensive and can result in things like allocations, full-scans
98    /// and copy operations.
99    pub fn compact(&self) -> VortexResult<Canonical> {
100        match self {
101            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
102            Canonical::List(array) => Ok(Canonical::List(array.reset_offsets()?)),
103            _ => Ok(self.clone()),
104        }
105    }
106}
107
108// Unwrap canonical type back down to specialized type.
109impl Canonical {
110    pub fn into_null(self) -> VortexResult<NullArray> {
111        if let Canonical::Null(a) = self {
112            Ok(a)
113        } else {
114            vortex_bail!("Cannot unwrap NullArray from {:?}", &self)
115        }
116    }
117
118    pub fn into_bool(self) -> VortexResult<BoolArray> {
119        if let Canonical::Bool(a) = self {
120            Ok(a)
121        } else {
122            vortex_bail!("Cannot unwrap BoolArray from {:?}", &self)
123        }
124    }
125
126    pub fn into_primitive(self) -> VortexResult<PrimitiveArray> {
127        if let Canonical::Primitive(a) = self {
128            Ok(a)
129        } else {
130            vortex_bail!("Cannot unwrap PrimitiveArray from {:?}", &self)
131        }
132    }
133
134    pub fn into_decimal(self) -> VortexResult<DecimalArray> {
135        if let Canonical::Decimal(a) = self {
136            Ok(a)
137        } else {
138            vortex_bail!("Cannot unwrap DecimalArray from {:?}", &self)
139        }
140    }
141
142    pub fn into_varbinview(self) -> VortexResult<VarBinViewArray> {
143        if let Canonical::VarBinView(a) = self {
144            Ok(a)
145        } else {
146            vortex_bail!("Cannot unwrap VarBinViewArray from {:?}", &self)
147        }
148    }
149
150    pub fn into_list(self) -> VortexResult<ListArray> {
151        if let Canonical::List(a) = self {
152            Ok(a)
153        } else {
154            vortex_bail!("Cannot unwrap ListArray from {:?}", &self)
155        }
156    }
157
158    pub fn into_fixed_size_list(self) -> VortexResult<FixedSizeListArray> {
159        if let Canonical::FixedSizeList(a) = self {
160            Ok(a)
161        } else {
162            vortex_bail!("Cannot unwrap FixedSizeListArray from {:?}", &self)
163        }
164    }
165
166    pub fn into_struct(self) -> VortexResult<StructArray> {
167        if let Canonical::Struct(a) = self {
168            Ok(a)
169        } else {
170            vortex_bail!("Cannot unwrap StructArray from {:?}", &self)
171        }
172    }
173
174    pub fn into_extension(self) -> VortexResult<ExtensionArray> {
175        if let Canonical::Extension(a) = self {
176            Ok(a)
177        } else {
178            vortex_bail!("Cannot unwrap ExtensionArray from {:?}", &self)
179        }
180    }
181}
182
183impl AsRef<dyn Array> for Canonical {
184    fn as_ref(&self) -> &(dyn Array + 'static) {
185        match &self {
186            Canonical::Null(a) => a.as_ref(),
187            Canonical::Bool(a) => a.as_ref(),
188            Canonical::Primitive(a) => a.as_ref(),
189            Canonical::Decimal(a) => a.as_ref(),
190            Canonical::Struct(a) => a.as_ref(),
191            Canonical::List(a) => a.as_ref(),
192            Canonical::FixedSizeList(a) => a.as_ref(),
193            Canonical::VarBinView(a) => a.as_ref(),
194            Canonical::Extension(a) => a.as_ref(),
195        }
196    }
197}
198
199impl IntoArray for Canonical {
200    fn into_array(self) -> ArrayRef {
201        match self {
202            Canonical::Null(a) => a.into_array(),
203            Canonical::Bool(a) => a.into_array(),
204            Canonical::Primitive(a) => a.into_array(),
205            Canonical::Decimal(a) => a.into_array(),
206            Canonical::Struct(a) => a.into_array(),
207            Canonical::List(a) => a.into_array(),
208            Canonical::FixedSizeList(a) => a.into_array(),
209            Canonical::VarBinView(a) => a.into_array(),
210            Canonical::Extension(a) => a.into_array(),
211        }
212    }
213}
214
215/// Trait for types that can be converted from an owned type into an owned array variant.
216///
217/// # Canonicalization
218///
219/// This trait has a blanket implementation for all types implementing [ToCanonical].
220pub trait ToCanonical {
221    /// Canonicalize into a [`NullArray`] if the target is [`Null`][DType::Null] typed.
222    fn to_null(&self) -> VortexResult<NullArray>;
223
224    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`][DType::Bool] typed.
225    fn to_bool(&self) -> VortexResult<BoolArray>;
226
227    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`][DType::Primitive]
228    /// typed.
229    fn to_primitive(&self) -> VortexResult<PrimitiveArray>;
230
231    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`][DType::Decimal]
232    /// typed.
233    fn to_decimal(&self) -> VortexResult<DecimalArray>;
234
235    /// Canonicalize into a [`StructArray`] if the target is [`Struct`][DType::Struct] typed.
236    fn to_struct(&self) -> VortexResult<StructArray>;
237
238    /// Canonicalize into a [`ListArray`] if the target is [`List`][DType::List] typed.
239    fn to_list(&self) -> VortexResult<ListArray>;
240
241    /// Canonicalize into a [`ListArray`] if the target is [`List`][DType::List] typed.
242    fn to_fixed_size_list(&self) -> VortexResult<FixedSizeListArray>;
243
244    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`][DType::Utf8]
245    /// or [`Binary`][DType::Binary] typed.
246    fn to_varbinview(&self) -> VortexResult<VarBinViewArray>;
247
248    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`][DType::Extension]
249    /// typed.
250    fn to_extension(&self) -> VortexResult<ExtensionArray>;
251}
252
253// Blanket impl for all Array encodings.
254impl<A: Array + ?Sized> ToCanonical for A {
255    fn to_null(&self) -> VortexResult<NullArray> {
256        self.to_canonical()?.into_null()
257    }
258
259    fn to_bool(&self) -> VortexResult<BoolArray> {
260        self.to_canonical()?.into_bool()
261    }
262
263    fn to_primitive(&self) -> VortexResult<PrimitiveArray> {
264        self.to_canonical()?.into_primitive()
265    }
266
267    fn to_decimal(&self) -> VortexResult<DecimalArray> {
268        self.to_canonical()?.into_decimal()
269    }
270
271    fn to_struct(&self) -> VortexResult<StructArray> {
272        self.to_canonical()?.into_struct()
273    }
274
275    fn to_list(&self) -> VortexResult<ListArray> {
276        self.to_canonical()?.into_list()
277    }
278
279    fn to_fixed_size_list(&self) -> VortexResult<FixedSizeListArray> {
280        self.to_canonical()?.into_fixed_size_list()
281    }
282
283    fn to_varbinview(&self) -> VortexResult<VarBinViewArray> {
284        self.to_canonical()?.into_varbinview()
285    }
286
287    fn to_extension(&self) -> VortexResult<ExtensionArray> {
288        self.to_canonical()?.into_extension()
289    }
290}
291
292impl From<Canonical> for ArrayRef {
293    fn from(value: Canonical) -> Self {
294        match value {
295            Canonical::Null(a) => a.into_array(),
296            Canonical::Bool(a) => a.into_array(),
297            Canonical::Primitive(a) => a.into_array(),
298            Canonical::Decimal(a) => a.into_array(),
299            Canonical::Struct(a) => a.into_array(),
300            Canonical::List(a) => a.into_array(),
301            Canonical::FixedSizeList(a) => a.into_array(),
302            Canonical::VarBinView(a) => a.into_array(),
303            Canonical::Extension(a) => a.into_array(),
304        }
305    }
306}
307
308#[cfg(test)]
309mod test {
310    use std::sync::Arc;
311
312    use arrow_array::cast::AsArray;
313    use arrow_array::types::{Int32Type, Int64Type, UInt64Type};
314    use arrow_array::{
315        Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
316        PrimitiveArray as ArrowPrimitiveArray, StringArray, StringViewArray,
317        StructArray as ArrowStructArray,
318    };
319    use arrow_buffer::{NullBufferBuilder, OffsetBuffer};
320    use arrow_schema::{DataType, Field};
321    use vortex_buffer::buffer;
322
323    use crate::arrays::{ConstantArray, StructArray};
324    use crate::arrow::{FromArrowArray, IntoArrowArray};
325    use crate::{ArrayRef, IntoArray};
326
327    #[test]
328    fn test_canonicalize_nested_struct() {
329        // Create a struct array with multiple internal components.
330        let nested_struct_array = StructArray::from_fields(&[
331            ("a", buffer![1u64].into_array()),
332            (
333                "b",
334                StructArray::from_fields(&[(
335                    "inner_a",
336                    // The nested struct contains a ConstantArray representing the primitive array
337                    //   [100i64]
338                    // ConstantArray is not a canonical type, so converting `into_arrow()` should
339                    // map this to the nearest canonical type (PrimitiveArray).
340                    ConstantArray::new(100i64, 1).into_array(),
341                )])
342                .unwrap()
343                .into_array(),
344            ),
345        ])
346        .unwrap();
347
348        let arrow_struct = nested_struct_array
349            .into_array()
350            .into_arrow_preferred()
351            .unwrap()
352            .as_any()
353            .downcast_ref::<ArrowStructArray>()
354            .cloned()
355            .unwrap();
356
357        assert!(
358            arrow_struct
359                .column(0)
360                .as_any()
361                .downcast_ref::<ArrowPrimitiveArray<UInt64Type>>()
362                .is_some()
363        );
364
365        let inner_struct = arrow_struct
366            .column(1)
367            .clone()
368            .as_any()
369            .downcast_ref::<ArrowStructArray>()
370            .cloned()
371            .unwrap();
372
373        let inner_a = inner_struct
374            .column(0)
375            .as_any()
376            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
377        assert!(inner_a.is_some());
378
379        assert_eq!(
380            inner_a.cloned().unwrap(),
381            ArrowPrimitiveArray::from_iter([100i64]),
382        );
383    }
384
385    #[test]
386    fn roundtrip_struct() {
387        let mut nulls = NullBufferBuilder::new(6);
388        nulls.append_n_non_nulls(4);
389        nulls.append_null();
390        nulls.append_non_null();
391        let names = Arc::new(StringViewArray::from_iter(vec![
392            Some("Joseph"),
393            None,
394            Some("Angela"),
395            Some("Mikhail"),
396            None,
397            None,
398        ]));
399        let ages = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
400            Some(25),
401            Some(31),
402            None,
403            Some(57),
404            None,
405            None,
406        ]));
407
408        let arrow_struct = ArrowStructArray::new(
409            vec![
410                Arc::new(Field::new("name", DataType::Utf8View, true)),
411                Arc::new(Field::new("age", DataType::Int32, true)),
412            ]
413            .into(),
414            vec![names, ages],
415            nulls.finish(),
416        );
417
418        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true);
419
420        assert_eq!(
421            &arrow_struct,
422            vortex_struct.into_arrow_preferred().unwrap().as_struct()
423        );
424    }
425
426    #[test]
427    fn roundtrip_list() {
428        let names = Arc::new(StringArray::from_iter(vec![
429            Some("Joseph"),
430            Some("Angela"),
431            Some("Mikhail"),
432        ]));
433
434        let arrow_list = ArrowListArray::new(
435            Arc::new(Field::new_list_field(DataType::Utf8, true)),
436            OffsetBuffer::from_lengths(vec![0, 2, 1]),
437            names,
438            None,
439        );
440        let list_data_type = arrow_list.data_type();
441
442        let vortex_list = ArrayRef::from_arrow(&arrow_list, true);
443
444        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();
445
446        assert_eq!(
447            (Arc::new(arrow_list.clone()) as ArrowArrayRef).as_ref(),
448            rt_arrow_list.as_ref()
449        );
450    }
451}