vortex_array/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Encodings that enable zero-copy sharing of data with Arrow.
5
6use vortex_dtype::DType;
7use vortex_error::{VortexExpect, VortexResult, vortex_bail};
8
9use crate::arrays::{
10    BoolArray, DecimalArray, ExtensionArray, ListArray, NullArray, PrimitiveArray, StructArray,
11    VarBinViewArray,
12};
13use crate::builders::builder_with_capacity;
14use crate::{Array, ArrayRef, IntoArray};
15
16/// An enum capturing the default uncompressed encodings for each [Vortex type][DType].
17///
18/// Any array can be decoded into canonical form via the [`to_canonical`][Array::to_canonical]
19/// trait method. This is the simplest encoding for a type, and will not be compressed but may
20/// contain compressed child arrays.
21///
22/// Canonical form is useful for doing type-specific compute where you need to know that all
23/// elements are laid out decompressed and contiguous in memory.
24///
25/// # Laziness
26///
27/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
28/// `Struct` type, individual column child arrays may still be compressed. This allows
29/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
30/// never need to be decoded into canonical form at all depending on the compute.
31///
32/// # Arrow interoperability
33///
34/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
35/// zero-copy, and the corresponding Arrow array types can also be built directly.
36///
37/// The full list of canonical types and their equivalent Arrow array types are:
38///
39/// * `NullArray`: [`arrow_array::NullArray`]
40/// * `BoolArray`: [`arrow_array::BooleanArray`]
41/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
42/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
43/// * `StructArray`: [`arrow_array::StructArray`]
44/// * `ListArray`: [`arrow_array::ListArray`]
45/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
46///
47/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
48/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
49/// create ambiguity.
50/// Thus, if you receive an Arrow array, compress it using Vortex, and then
51/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
52/// variants to hold the data.
53///
54/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
55/// will correspond to an arrow-rs [`arrow_schema::DataType`].
56///
57/// # Views support
58///
59/// Binary and String views, also known as "German strings" are a better encoding format for
60/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
61/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
62/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
63/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
64/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
65/// from memory.
66#[derive(Debug, Clone)]
67pub enum Canonical {
68    Null(NullArray),
69    Bool(BoolArray),
70    Primitive(PrimitiveArray),
71    Decimal(DecimalArray),
72    Struct(StructArray),
73    // TODO(joe): maybe this should be a ListView, however this will be annoying in spiral
74    List(ListArray),
75    VarBinView(VarBinViewArray),
76    Extension(ExtensionArray),
77}
78
79impl Canonical {
80    /// Create an empty canonical array of the given dtype.
81    pub fn empty(dtype: &DType) -> Canonical {
82        builder_with_capacity(dtype, 0)
83            .finish()
84            .to_canonical()
85            .vortex_expect("cannot fail to convert an empty array to canonical")
86    }
87}
88
89impl Canonical {
90    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
91    ///
92    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
93    /// they can accumulate wasted space after slicing and taking operations.
94    ///
95    /// This operation is very expensive and can result in things like allocations, full-scans
96    /// and copy operations.
97    pub fn compact(&self) -> VortexResult<Canonical> {
98        match self {
99            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(array.compact_buffers()?)),
100            Canonical::List(array) => Ok(Canonical::List(array.reset_offsets()?)),
101            other => Ok(other.clone()),
102        }
103    }
104}
105
106// Unwrap canonical type back down to specialized type.
107impl Canonical {
108    pub fn into_null(self) -> VortexResult<NullArray> {
109        match self {
110            Canonical::Null(a) => Ok(a),
111            _ => vortex_bail!("Cannot unwrap NullArray from {:?}", &self),
112        }
113    }
114
115    pub fn into_bool(self) -> VortexResult<BoolArray> {
116        match self {
117            Canonical::Bool(a) => Ok(a),
118            _ => vortex_bail!("Cannot unwrap BoolArray from {:?}", &self),
119        }
120    }
121
122    pub fn into_primitive(self) -> VortexResult<PrimitiveArray> {
123        match self {
124            Canonical::Primitive(a) => Ok(a),
125            _ => vortex_bail!("Cannot unwrap PrimitiveArray from {:?}", &self),
126        }
127    }
128
129    pub fn into_decimal(self) -> VortexResult<DecimalArray> {
130        match self {
131            Canonical::Decimal(a) => Ok(a),
132            _ => vortex_bail!("Cannot unwrap DecimalArray from {:?}", &self),
133        }
134    }
135
136    pub fn into_struct(self) -> VortexResult<StructArray> {
137        match self {
138            Canonical::Struct(a) => Ok(a),
139            _ => vortex_bail!("Cannot unwrap StructArray from {:?}", &self),
140        }
141    }
142
143    pub fn into_list(self) -> VortexResult<ListArray> {
144        match self {
145            Canonical::List(a) => Ok(a),
146            _ => vortex_bail!("Cannot unwrap ListArray from {:?}", &self),
147        }
148    }
149
150    pub fn into_varbinview(self) -> VortexResult<VarBinViewArray> {
151        match self {
152            Canonical::VarBinView(a) => Ok(a),
153            _ => vortex_bail!("Cannot unwrap VarBinViewArray from {:?}", &self),
154        }
155    }
156
157    pub fn into_extension(self) -> VortexResult<ExtensionArray> {
158        match self {
159            Canonical::Extension(a) => Ok(a),
160            _ => vortex_bail!("Cannot unwrap ExtensionArray from {:?}", &self),
161        }
162    }
163}
164
165impl AsRef<dyn Array> for Canonical {
166    fn as_ref(&self) -> &(dyn Array + 'static) {
167        match &self {
168            Canonical::Null(a) => a.as_ref(),
169            Canonical::Bool(a) => a.as_ref(),
170            Canonical::Primitive(a) => a.as_ref(),
171            Canonical::Decimal(a) => a.as_ref(),
172            Canonical::Struct(a) => a.as_ref(),
173            Canonical::List(a) => a.as_ref(),
174            Canonical::VarBinView(a) => a.as_ref(),
175            Canonical::Extension(a) => a.as_ref(),
176        }
177    }
178}
179
180impl IntoArray for Canonical {
181    fn into_array(self) -> ArrayRef {
182        match self {
183            Canonical::Null(a) => a.into_array(),
184            Canonical::Bool(a) => a.into_array(),
185            Canonical::Primitive(a) => a.into_array(),
186            Canonical::Decimal(a) => a.into_array(),
187            Canonical::Struct(a) => a.into_array(),
188            Canonical::List(a) => a.into_array(),
189            Canonical::VarBinView(a) => a.into_array(),
190            Canonical::Extension(a) => a.into_array(),
191        }
192    }
193}
194
195/// Trait for types that can be converted from an owned type into an owned array variant.
196///
197/// # Canonicalization
198///
199/// This trait has a blanket implementation for all types implementing [ToCanonical].
200pub trait ToCanonical {
201    /// Canonicalize into a [`NullArray`] if the target is [`Null`][DType::Null] typed.
202    fn to_null(&self) -> VortexResult<NullArray>;
203
204    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`][DType::Bool] typed.
205    fn to_bool(&self) -> VortexResult<BoolArray>;
206
207    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`][DType::Primitive]
208    /// typed.
209    fn to_primitive(&self) -> VortexResult<PrimitiveArray>;
210
211    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`][DType::Decimal]
212    /// typed.
213    fn to_decimal(&self) -> VortexResult<DecimalArray>;
214
215    /// Canonicalize into a [`StructArray`] if the target is [`Struct`][DType::Struct] typed.
216    fn to_struct(&self) -> VortexResult<StructArray>;
217
218    /// Canonicalize into a [`ListArray`] if the target is [`List`][DType::List] typed.
219    fn to_list(&self) -> VortexResult<ListArray>;
220
221    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`][DType::Utf8]
222    /// or [`Binary`][DType::Binary] typed.
223    fn to_varbinview(&self) -> VortexResult<VarBinViewArray>;
224
225    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`][DType::Extension]
226    /// typed.
227    fn to_extension(&self) -> VortexResult<ExtensionArray>;
228}
229
230// Blanket impl for all Array encodings.
231impl<A: Array + ?Sized> ToCanonical for A {
232    fn to_null(&self) -> VortexResult<NullArray> {
233        self.to_canonical()?.into_null()
234    }
235
236    fn to_bool(&self) -> VortexResult<BoolArray> {
237        self.to_canonical()?.into_bool()
238    }
239
240    fn to_primitive(&self) -> VortexResult<PrimitiveArray> {
241        self.to_canonical()?.into_primitive()
242    }
243
244    fn to_decimal(&self) -> VortexResult<DecimalArray> {
245        self.to_canonical()?.into_decimal()
246    }
247
248    fn to_struct(&self) -> VortexResult<StructArray> {
249        self.to_canonical()?.into_struct()
250    }
251
252    fn to_list(&self) -> VortexResult<ListArray> {
253        self.to_canonical()?.into_list()
254    }
255
256    fn to_varbinview(&self) -> VortexResult<VarBinViewArray> {
257        self.to_canonical()?.into_varbinview()
258    }
259
260    fn to_extension(&self) -> VortexResult<ExtensionArray> {
261        self.to_canonical()?.into_extension()
262    }
263}
264
265impl From<Canonical> for ArrayRef {
266    fn from(value: Canonical) -> Self {
267        match value {
268            Canonical::Null(a) => a.into_array(),
269            Canonical::Bool(a) => a.into_array(),
270            Canonical::Primitive(a) => a.into_array(),
271            Canonical::Decimal(a) => a.into_array(),
272            Canonical::Struct(a) => a.into_array(),
273            Canonical::List(a) => a.into_array(),
274            Canonical::VarBinView(a) => a.into_array(),
275            Canonical::Extension(a) => a.into_array(),
276        }
277    }
278}
279
280#[cfg(test)]
281mod test {
282    use std::sync::Arc;
283
284    use arrow_array::cast::AsArray;
285    use arrow_array::types::{Int32Type, Int64Type, UInt64Type};
286    use arrow_array::{
287        Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
288        PrimitiveArray as ArrowPrimitiveArray, StringArray, StringViewArray,
289        StructArray as ArrowStructArray,
290    };
291    use arrow_buffer::{NullBufferBuilder, OffsetBuffer};
292    use arrow_schema::{DataType, Field};
293    use vortex_buffer::buffer;
294
295    use crate::arrays::{ConstantArray, StructArray};
296    use crate::arrow::{FromArrowArray, IntoArrowArray};
297    use crate::{ArrayRef, IntoArray};
298
299    #[test]
300    fn test_canonicalize_nested_struct() {
301        // Create a struct array with multiple internal components.
302        let nested_struct_array = StructArray::from_fields(&[
303            ("a", buffer![1u64].into_array()),
304            (
305                "b",
306                StructArray::from_fields(&[(
307                    "inner_a",
308                    // The nested struct contains a ConstantArray representing the primitive array
309                    //   [100i64]
310                    // ConstantArray is not a canonical type, so converting `into_arrow()` should
311                    // map this to the nearest canonical type (PrimitiveArray).
312                    ConstantArray::new(100i64, 1).into_array(),
313                )])
314                .unwrap()
315                .into_array(),
316            ),
317        ])
318        .unwrap();
319
320        let arrow_struct = nested_struct_array
321            .into_array()
322            .into_arrow_preferred()
323            .unwrap()
324            .as_any()
325            .downcast_ref::<ArrowStructArray>()
326            .cloned()
327            .unwrap();
328
329        assert!(
330            arrow_struct
331                .column(0)
332                .as_any()
333                .downcast_ref::<ArrowPrimitiveArray<UInt64Type>>()
334                .is_some()
335        );
336
337        let inner_struct = arrow_struct
338            .column(1)
339            .clone()
340            .as_any()
341            .downcast_ref::<ArrowStructArray>()
342            .cloned()
343            .unwrap();
344
345        let inner_a = inner_struct
346            .column(0)
347            .as_any()
348            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
349        assert!(inner_a.is_some());
350
351        assert_eq!(
352            inner_a.cloned().unwrap(),
353            ArrowPrimitiveArray::from_iter([100i64]),
354        );
355    }
356
357    #[test]
358    fn roundtrip_struct() {
359        let mut nulls = NullBufferBuilder::new(6);
360        nulls.append_n_non_nulls(4);
361        nulls.append_null();
362        nulls.append_non_null();
363        let names = Arc::new(StringViewArray::from_iter(vec![
364            Some("Joseph"),
365            None,
366            Some("Angela"),
367            Some("Mikhail"),
368            None,
369            None,
370        ]));
371        let ages = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
372            Some(25),
373            Some(31),
374            None,
375            Some(57),
376            None,
377            None,
378        ]));
379
380        let arrow_struct = ArrowStructArray::new(
381            vec![
382                Arc::new(Field::new("name", DataType::Utf8View, true)),
383                Arc::new(Field::new("age", DataType::Int32, true)),
384            ]
385            .into(),
386            vec![names, ages],
387            nulls.finish(),
388        );
389
390        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true);
391
392        assert_eq!(
393            &arrow_struct,
394            vortex_struct.into_arrow_preferred().unwrap().as_struct()
395        );
396    }
397
398    #[test]
399    fn roundtrip_list() {
400        let names = Arc::new(StringArray::from_iter(vec![
401            Some("Joseph"),
402            Some("Angela"),
403            Some("Mikhail"),
404        ]));
405
406        let arrow_list = ArrowListArray::new(
407            Arc::new(Field::new_list_field(DataType::Utf8, true)),
408            OffsetBuffer::from_lengths(vec![0, 2, 1]),
409            names,
410            None,
411        );
412        let list_data_type = arrow_list.data_type();
413
414        let vortex_list = ArrayRef::from_arrow(&arrow_list, true);
415
416        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();
417
418        assert_eq!(
419            (Arc::new(arrow_list.clone()) as ArrowArrayRef).as_ref(),
420            rt_arrow_list.as_ref()
421        );
422    }
423}