vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use vortex_dtype::DType;
7use vortex_error::{VortexExpect, VortexResult, vortex_bail};
8
9use crate::arrays::{
10    BoolArray, DecimalArray, ExtensionArray, ListArray, NullArray, PrimitiveArray, StructArray,
11    VarBinViewArray,
12};
13use crate::builders::builder_with_capacity;
14use crate::{Array, ArrayRef, IntoArray};
15
16/// An enum capturing the default uncompressed encodings for each [Vortex type][DType].
17///
18/// Any array can be decoded into canonical form via the [`to_canonical`][Array::to_canonical]
19/// trait method. This is the simplest encoding for a type, and will not be compressed but may
20/// contain compressed child arrays.
21///
22/// Canonical form is useful for doing type-specific compute where you need to know that all
23/// elements are laid out decompressed and contiguous in memory.
24///
25/// # Laziness
26///
27/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
28/// `Struct` type, individual column child arrays may still be compressed. This allows
29/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
30/// never need to be decoded into canonical form at all depending on the compute.
31///
32/// # Arrow interoperability
33///
34/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
35/// zero-copy, and the corresponding Arrow array types can also be built directly.
36///
37/// The full list of canonical types and their equivalent Arrow array types are:
38///
39/// * `NullArray`: [`arrow_array::NullArray`]
40/// * `BoolArray`: [`arrow_array::BooleanArray`]
41/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
42/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
43/// * `StructArray`: [`arrow_array::StructArray`]
44/// * `ListArray`: [`arrow_array::ListArray`]
45/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
46///
47/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
48/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
49/// create ambiguity.
50/// Thus, if you receive an Arrow array, compress it using Vortex, and then
51/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
52/// variants to hold the data.
53///
54/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
55/// will correspond to an arrow-rs [`arrow_schema::DataType`].
56///
57/// # Views support
58///
59/// Binary and String views, also known as "German strings" are a better encoding format for
60/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
61/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
62/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
63/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
64/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
65/// from memory.
66#[derive(Debug, Clone)]
67pub enum Canonical {
68    Null(NullArray),
69    Bool(BoolArray),
70    Primitive(PrimitiveArray),
71    Decimal(DecimalArray),
72    Struct(StructArray),
73    // TODO(joe): maybe this should be a ListView, however this will be annoying in spiral
74    List(ListArray),
75    // TODO(connor)[FixedSizeList]
76    VarBinView(VarBinViewArray),
77    Extension(ExtensionArray),
78}
79
80impl Canonical {
81    /// Create an empty canonical array of the given dtype.
82    pub fn empty(dtype: &DType) -> Canonical {
83        builder_with_capacity(dtype, 0)
84            .finish()
85            .to_canonical()
86            .vortex_expect("cannot fail to convert an empty array to canonical")
87    }
88}
89
90impl Canonical {
91    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
92    ///
93    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
94    /// they can accumulate wasted space after slicing and taking operations.
95    ///
96    /// This operation is very expensive and can result in things like allocations, full-scans
97    /// and copy operations.
98    pub fn compact(&self) -> VortexResult<Canonical> {
99        match self {
100            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
101            Canonical::List(array) => Ok(Canonical::List(array.reset_offsets()?)),
102            _ => Ok(self.clone()),
103        }
104    }
105}
106
107// Unwrap canonical type back down to specialized type.
108impl Canonical {
109    pub fn into_null(self) -> VortexResult<NullArray> {
110        if let Canonical::Null(a) = self {
111            Ok(a)
112        } else {
113            vortex_bail!("Cannot unwrap NullArray from {:?}", &self)
114        }
115    }
116
117    pub fn into_bool(self) -> VortexResult<BoolArray> {
118        if let Canonical::Bool(a) = self {
119            Ok(a)
120        } else {
121            vortex_bail!("Cannot unwrap BoolArray from {:?}", &self)
122        }
123    }
124
125    pub fn into_primitive(self) -> VortexResult<PrimitiveArray> {
126        if let Canonical::Primitive(a) = self {
127            Ok(a)
128        } else {
129            vortex_bail!("Cannot unwrap PrimitiveArray from {:?}", &self)
130        }
131    }
132
133    pub fn into_decimal(self) -> VortexResult<DecimalArray> {
134        if let Canonical::Decimal(a) = self {
135            Ok(a)
136        } else {
137            vortex_bail!("Cannot unwrap DecimalArray from {:?}", &self)
138        }
139    }
140
141    pub fn into_struct(self) -> VortexResult<StructArray> {
142        if let Canonical::Struct(a) = self {
143            Ok(a)
144        } else {
145            vortex_bail!("Cannot unwrap StructArray from {:?}", &self)
146        }
147    }
148
149    pub fn into_list(self) -> VortexResult<ListArray> {
150        if let Canonical::List(a) = self {
151            Ok(a)
152        } else {
153            vortex_bail!("Cannot unwrap ListArray from {:?}", &self)
154        }
155    }
156
157    pub fn into_varbinview(self) -> VortexResult<VarBinViewArray> {
158        if let Canonical::VarBinView(a) = self {
159            Ok(a)
160        } else {
161            vortex_bail!("Cannot unwrap VarBinViewArray from {:?}", &self)
162        }
163    }
164
165    pub fn into_extension(self) -> VortexResult<ExtensionArray> {
166        if let Canonical::Extension(a) = self {
167            Ok(a)
168        } else {
169            vortex_bail!("Cannot unwrap ExtensionArray from {:?}", &self)
170        }
171    }
172}
173
174impl AsRef<dyn Array> for Canonical {
175    fn as_ref(&self) -> &(dyn Array + 'static) {
176        match &self {
177            Canonical::Null(a) => a.as_ref(),
178            Canonical::Bool(a) => a.as_ref(),
179            Canonical::Primitive(a) => a.as_ref(),
180            Canonical::Decimal(a) => a.as_ref(),
181            Canonical::Struct(a) => a.as_ref(),
182            Canonical::List(a) => a.as_ref(),
183            Canonical::VarBinView(a) => a.as_ref(),
184            Canonical::Extension(a) => a.as_ref(),
185        }
186    }
187}
188
189impl IntoArray for Canonical {
190    fn into_array(self) -> ArrayRef {
191        match self {
192            Canonical::Null(a) => a.into_array(),
193            Canonical::Bool(a) => a.into_array(),
194            Canonical::Primitive(a) => a.into_array(),
195            Canonical::Decimal(a) => a.into_array(),
196            Canonical::Struct(a) => a.into_array(),
197            Canonical::List(a) => a.into_array(),
198            Canonical::VarBinView(a) => a.into_array(),
199            Canonical::Extension(a) => a.into_array(),
200        }
201    }
202}
203
204/// Trait for types that can be converted from an owned type into an owned array variant.
205///
206/// # Canonicalization
207///
208/// This trait has a blanket implementation for all types implementing [ToCanonical].
209pub trait ToCanonical {
210    /// Canonicalize into a [`NullArray`] if the target is [`Null`][DType::Null] typed.
211    fn to_null(&self) -> VortexResult<NullArray>;
212
213    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`][DType::Bool] typed.
214    fn to_bool(&self) -> VortexResult<BoolArray>;
215
216    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`][DType::Primitive]
217    /// typed.
218    fn to_primitive(&self) -> VortexResult<PrimitiveArray>;
219
220    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`][DType::Decimal]
221    /// typed.
222    fn to_decimal(&self) -> VortexResult<DecimalArray>;
223
224    /// Canonicalize into a [`StructArray`] if the target is [`Struct`][DType::Struct] typed.
225    fn to_struct(&self) -> VortexResult<StructArray>;
226
227    /// Canonicalize into a [`ListArray`] if the target is [`List`][DType::List] typed.
228    fn to_list(&self) -> VortexResult<ListArray>;
229
230    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`][DType::Utf8]
231    /// or [`Binary`][DType::Binary] typed.
232    fn to_varbinview(&self) -> VortexResult<VarBinViewArray>;
233
234    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`][DType::Extension]
235    /// typed.
236    fn to_extension(&self) -> VortexResult<ExtensionArray>;
237}
238
239// Blanket impl for all Array encodings.
240impl<A: Array + ?Sized> ToCanonical for A {
241    fn to_null(&self) -> VortexResult<NullArray> {
242        self.to_canonical()?.into_null()
243    }
244
245    fn to_bool(&self) -> VortexResult<BoolArray> {
246        self.to_canonical()?.into_bool()
247    }
248
249    fn to_primitive(&self) -> VortexResult<PrimitiveArray> {
250        self.to_canonical()?.into_primitive()
251    }
252
253    fn to_decimal(&self) -> VortexResult<DecimalArray> {
254        self.to_canonical()?.into_decimal()
255    }
256
257    fn to_struct(&self) -> VortexResult<StructArray> {
258        self.to_canonical()?.into_struct()
259    }
260
261    fn to_list(&self) -> VortexResult<ListArray> {
262        self.to_canonical()?.into_list()
263    }
264
265    fn to_varbinview(&self) -> VortexResult<VarBinViewArray> {
266        self.to_canonical()?.into_varbinview()
267    }
268
269    fn to_extension(&self) -> VortexResult<ExtensionArray> {
270        self.to_canonical()?.into_extension()
271    }
272}
273
274impl From<Canonical> for ArrayRef {
275    fn from(value: Canonical) -> Self {
276        match value {
277            Canonical::Null(a) => a.into_array(),
278            Canonical::Bool(a) => a.into_array(),
279            Canonical::Primitive(a) => a.into_array(),
280            Canonical::Decimal(a) => a.into_array(),
281            Canonical::Struct(a) => a.into_array(),
282            Canonical::List(a) => a.into_array(),
283            Canonical::VarBinView(a) => a.into_array(),
284            Canonical::Extension(a) => a.into_array(),
285        }
286    }
287}
288
289#[cfg(test)]
290mod test {
291    use std::sync::Arc;
292
293    use arrow_array::cast::AsArray;
294    use arrow_array::types::{Int32Type, Int64Type, UInt64Type};
295    use arrow_array::{
296        Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
297        PrimitiveArray as ArrowPrimitiveArray, StringArray, StringViewArray,
298        StructArray as ArrowStructArray,
299    };
300    use arrow_buffer::{NullBufferBuilder, OffsetBuffer};
301    use arrow_schema::{DataType, Field};
302    use vortex_buffer::buffer;
303
304    use crate::arrays::{ConstantArray, StructArray};
305    use crate::arrow::{FromArrowArray, IntoArrowArray};
306    use crate::{ArrayRef, IntoArray};
307
308    #[test]
309    fn test_canonicalize_nested_struct() {
310        // Create a struct array with multiple internal components.
311        let nested_struct_array = StructArray::from_fields(&[
312            ("a", buffer![1u64].into_array()),
313            (
314                "b",
315                StructArray::from_fields(&[(
316                    "inner_a",
317                    // The nested struct contains a ConstantArray representing the primitive array
318                    //   [100i64]
319                    // ConstantArray is not a canonical type, so converting `into_arrow()` should
320                    // map this to the nearest canonical type (PrimitiveArray).
321                    ConstantArray::new(100i64, 1).into_array(),
322                )])
323                .unwrap()
324                .into_array(),
325            ),
326        ])
327        .unwrap();
328
329        let arrow_struct = nested_struct_array
330            .into_array()
331            .into_arrow_preferred()
332            .unwrap()
333            .as_any()
334            .downcast_ref::<ArrowStructArray>()
335            .cloned()
336            .unwrap();
337
338        assert!(
339            arrow_struct
340                .column(0)
341                .as_any()
342                .downcast_ref::<ArrowPrimitiveArray<UInt64Type>>()
343                .is_some()
344        );
345
346        let inner_struct = arrow_struct
347            .column(1)
348            .clone()
349            .as_any()
350            .downcast_ref::<ArrowStructArray>()
351            .cloned()
352            .unwrap();
353
354        let inner_a = inner_struct
355            .column(0)
356            .as_any()
357            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
358        assert!(inner_a.is_some());
359
360        assert_eq!(
361            inner_a.cloned().unwrap(),
362            ArrowPrimitiveArray::from_iter([100i64]),
363        );
364    }
365
366    #[test]
367    fn roundtrip_struct() {
368        let mut nulls = NullBufferBuilder::new(6);
369        nulls.append_n_non_nulls(4);
370        nulls.append_null();
371        nulls.append_non_null();
372        let names = Arc::new(StringViewArray::from_iter(vec![
373            Some("Joseph"),
374            None,
375            Some("Angela"),
376            Some("Mikhail"),
377            None,
378            None,
379        ]));
380        let ages = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
381            Some(25),
382            Some(31),
383            None,
384            Some(57),
385            None,
386            None,
387        ]));
388
389        let arrow_struct = ArrowStructArray::new(
390            vec![
391                Arc::new(Field::new("name", DataType::Utf8View, true)),
392                Arc::new(Field::new("age", DataType::Int32, true)),
393            ]
394            .into(),
395            vec![names, ages],
396            nulls.finish(),
397        );
398
399        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true);
400
401        assert_eq!(
402            &arrow_struct,
403            vortex_struct.into_arrow_preferred().unwrap().as_struct()
404        );
405    }
406
407    #[test]
408    fn roundtrip_list() {
409        let names = Arc::new(StringArray::from_iter(vec![
410            Some("Joseph"),
411            Some("Angela"),
412            Some("Mikhail"),
413        ]));
414
415        let arrow_list = ArrowListArray::new(
416            Arc::new(Field::new_list_field(DataType::Utf8, true)),
417            OffsetBuffer::from_lengths(vec![0, 2, 1]),
418            names,
419            None,
420        );
421        let list_data_type = arrow_list.data_type();
422
423        let vortex_list = ArrayRef::from_arrow(&arrow_list, true);
424
425        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();
426
427        assert_eq!(
428            (Arc::new(arrow_list.clone()) as ArrowArrayRef).as_ref(),
429            rt_arrow_list.as_ref()
430        );
431    }
432}