vortex_array/
canonical.rs

1//! Encodings that enable zero-copy sharing of data with Arrow.
2
3use vortex_dtype::DType;
4use vortex_error::{VortexExpect, VortexResult, vortex_bail};
5
6use crate::arrays::{
7    BoolArray, DecimalArray, ExtensionArray, ListArray, NullArray, PrimitiveArray, StructArray,
8    VarBinViewArray,
9};
10use crate::builders::builder_with_capacity;
11use crate::{Array, ArrayRef, IntoArray};
12
13/// An enum capturing the default uncompressed encodings for each [Vortex type][DType].
14///
15/// Any array can be decoded into canonical form via the [`to_canonical`][Array::to_canonical]
16/// trait method. This is the simplest encoding for a type, and will not be compressed but may
17/// contain compressed child arrays.
18///
19/// Canonical form is useful for doing type-specific compute where you need to know that all
20/// elements are laid out decompressed and contiguous in memory.
21///
22/// # Laziness
23///
24/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
25/// `Struct` type, individual column child arrays may still be compressed. This allows
26/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
27/// never need to be decoded into canonical form at all depending on the compute.
28///
29/// # Arrow interoperability
30///
31/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
32/// zero-copy, and the corresponding Arrow array types can also be built directly.
33///
34/// The full list of canonical types and their equivalent Arrow array types are:
35///
36/// * `NullArray`: [`arrow_array::NullArray`]
37/// * `BoolArray`: [`arrow_array::BooleanArray`]
38/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
39/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
40/// * `StructArray`: [`arrow_array::StructArray`]
41/// * `ListArray`: [`arrow_array::ListArray`]
42/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
43///
44/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
45/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
46/// create ambiguity.
47/// Thus, if you receive an Arrow array, compress it using Vortex, and then
48/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
49/// variants to hold the data.
50///
51/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
52/// will correspond to an arrow-rs [`arrow_schema::DataType`].
53///
54/// # Views support
55///
56/// Binary and String views, also known as "German strings" are a better encoding format for
57/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
58/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
59/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
60/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
61/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
62/// from memory.
#[derive(Debug, Clone)]
pub enum Canonical {
    /// Canonical form of [`Null`][DType::Null] typed arrays.
    Null(NullArray),
    /// Canonical form of [`Bool`][DType::Bool] typed arrays.
    Bool(BoolArray),
    /// Canonical form of [`Primitive`][DType::Primitive] typed arrays.
    Primitive(PrimitiveArray),
    /// Canonical form of [`Decimal`][DType::Decimal] typed arrays.
    Decimal(DecimalArray),
    /// Canonical form of [`Struct`][DType::Struct] typed arrays.
    Struct(StructArray),
    /// Canonical form of [`List`][DType::List] typed arrays.
    // TODO(joe): maybe this should be a ListView, however this will be annoying in spiral
    List(ListArray),
    /// Canonical form of [`Utf8`][DType::Utf8] and [`Binary`][DType::Binary] typed arrays.
    VarBinView(VarBinViewArray),
    /// Canonical form of [`Extension`][DType::Extension] typed arrays.
    Extension(ExtensionArray),
}
75
76impl Canonical {
77    /// Create an empty canonical array of the given dtype.
78    pub fn empty(dtype: &DType) -> Canonical {
79        builder_with_capacity(dtype, 0)
80            .finish()
81            .to_canonical()
82            .vortex_expect("cannot fail to convert an empty array to canonical")
83    }
84}
85
86// Unwrap canonical type back down to specialized type.
87impl Canonical {
88    pub fn into_null(self) -> VortexResult<NullArray> {
89        match self {
90            Canonical::Null(a) => Ok(a),
91            _ => vortex_bail!("Cannot unwrap NullArray from {:?}", &self),
92        }
93    }
94
95    pub fn into_bool(self) -> VortexResult<BoolArray> {
96        match self {
97            Canonical::Bool(a) => Ok(a),
98            _ => vortex_bail!("Cannot unwrap BoolArray from {:?}", &self),
99        }
100    }
101
102    pub fn into_primitive(self) -> VortexResult<PrimitiveArray> {
103        match self {
104            Canonical::Primitive(a) => Ok(a),
105            _ => vortex_bail!("Cannot unwrap PrimitiveArray from {:?}", &self),
106        }
107    }
108
109    pub fn into_decimal(self) -> VortexResult<DecimalArray> {
110        match self {
111            Canonical::Decimal(a) => Ok(a),
112            _ => vortex_bail!("Cannot unwrap DecimalArray from {:?}", &self),
113        }
114    }
115
116    pub fn into_struct(self) -> VortexResult<StructArray> {
117        match self {
118            Canonical::Struct(a) => Ok(a),
119            _ => vortex_bail!("Cannot unwrap StructArray from {:?}", &self),
120        }
121    }
122
123    pub fn into_list(self) -> VortexResult<ListArray> {
124        match self {
125            Canonical::List(a) => Ok(a),
126            _ => vortex_bail!("Cannot unwrap StructArray from {:?}", &self),
127        }
128    }
129
130    pub fn into_varbinview(self) -> VortexResult<VarBinViewArray> {
131        match self {
132            Canonical::VarBinView(a) => Ok(a),
133            _ => vortex_bail!("Cannot unwrap VarBinViewArray from {:?}", &self),
134        }
135    }
136
137    pub fn into_extension(self) -> VortexResult<ExtensionArray> {
138        match self {
139            Canonical::Extension(a) => Ok(a),
140            _ => vortex_bail!("Cannot unwrap ExtensionArray from {:?}", &self),
141        }
142    }
143}
144
145impl AsRef<dyn Array> for Canonical {
146    fn as_ref(&self) -> &(dyn Array + 'static) {
147        match &self {
148            Canonical::Null(a) => a.as_ref(),
149            Canonical::Bool(a) => a.as_ref(),
150            Canonical::Primitive(a) => a.as_ref(),
151            Canonical::Decimal(a) => a.as_ref(),
152            Canonical::Struct(a) => a.as_ref(),
153            Canonical::List(a) => a.as_ref(),
154            Canonical::VarBinView(a) => a.as_ref(),
155            Canonical::Extension(a) => a.as_ref(),
156        }
157    }
158}
159
160impl IntoArray for Canonical {
161    fn into_array(self) -> ArrayRef {
162        match self {
163            Canonical::Null(a) => a.into_array(),
164            Canonical::Bool(a) => a.into_array(),
165            Canonical::Primitive(a) => a.into_array(),
166            Canonical::Decimal(a) => a.into_array(),
167            Canonical::Struct(a) => a.into_array(),
168            Canonical::List(a) => a.into_array(),
169            Canonical::VarBinView(a) => a.into_array(),
170            Canonical::Extension(a) => a.into_array(),
171        }
172    }
173}
174
/// Convenience methods for canonicalizing an array directly into a specific canonical array
/// type.
///
/// Each method returns an error if the array cannot be canonicalized, or if its canonical form
/// is not the requested variant.
///
/// # Blanket implementation
///
/// This trait has a blanket implementation for all types implementing [`Array`], which calls
/// [`to_canonical`][Array::to_canonical] and then unwraps the matching [`Canonical`] variant.
pub trait ToCanonical {
    /// Canonicalize into a [`NullArray`] if the target is [`Null`][DType::Null] typed.
    fn to_null(&self) -> VortexResult<NullArray>;

    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`][DType::Bool] typed.
    fn to_bool(&self) -> VortexResult<BoolArray>;

    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`][DType::Primitive]
    /// typed.
    fn to_primitive(&self) -> VortexResult<PrimitiveArray>;

    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`][DType::Decimal]
    /// typed.
    fn to_decimal(&self) -> VortexResult<DecimalArray>;

    /// Canonicalize into a [`StructArray`] if the target is [`Struct`][DType::Struct] typed.
    fn to_struct(&self) -> VortexResult<StructArray>;

    /// Canonicalize into a [`ListArray`] if the target is [`List`][DType::List] typed.
    fn to_list(&self) -> VortexResult<ListArray>;

    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`][DType::Utf8]
    /// or [`Binary`][DType::Binary] typed.
    fn to_varbinview(&self) -> VortexResult<VarBinViewArray>;

    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`][DType::Extension]
    /// typed.
    fn to_extension(&self) -> VortexResult<ExtensionArray>;
}
209
210// Blanket impl for all Array encodings.
211impl<A: Array + ?Sized> ToCanonical for A {
212    fn to_null(&self) -> VortexResult<NullArray> {
213        self.to_canonical()?.into_null()
214    }
215
216    fn to_bool(&self) -> VortexResult<BoolArray> {
217        self.to_canonical()?.into_bool()
218    }
219
220    fn to_primitive(&self) -> VortexResult<PrimitiveArray> {
221        self.to_canonical()?.into_primitive()
222    }
223
224    fn to_decimal(&self) -> VortexResult<DecimalArray> {
225        self.to_canonical()?.into_decimal()
226    }
227
228    fn to_struct(&self) -> VortexResult<StructArray> {
229        self.to_canonical()?.into_struct()
230    }
231
232    fn to_list(&self) -> VortexResult<ListArray> {
233        self.to_canonical()?.into_list()
234    }
235
236    fn to_varbinview(&self) -> VortexResult<VarBinViewArray> {
237        self.to_canonical()?.into_varbinview()
238    }
239
240    fn to_extension(&self) -> VortexResult<ExtensionArray> {
241        self.to_canonical()?.into_extension()
242    }
243}
244
245impl From<Canonical> for ArrayRef {
246    fn from(value: Canonical) -> Self {
247        match value {
248            Canonical::Null(a) => a.into_array(),
249            Canonical::Bool(a) => a.into_array(),
250            Canonical::Primitive(a) => a.into_array(),
251            Canonical::Decimal(a) => a.into_array(),
252            Canonical::Struct(a) => a.into_array(),
253            Canonical::List(a) => a.into_array(),
254            Canonical::VarBinView(a) => a.into_array(),
255            Canonical::Extension(a) => a.into_array(),
256        }
257    }
258}
259
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use arrow_array::cast::AsArray;
    use arrow_array::types::{Int32Type, Int64Type, UInt64Type};
    use arrow_array::{
        Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
        PrimitiveArray as ArrowPrimitiveArray, StringArray, StringViewArray,
        StructArray as ArrowStructArray,
    };
    use arrow_buffer::{NullBufferBuilder, OffsetBuffer};
    use arrow_schema::{DataType, Field};
    use vortex_buffer::buffer;

    use crate::arrays::{ConstantArray, StructArray};
    use crate::arrow::{FromArrowArray, IntoArrowArray};
    use crate::{ArrayRef, IntoArray};

    /// Converting a nested struct to Arrow must canonicalize non-canonical children too.
    #[test]
    fn test_canonicalize_nested_struct() {
        // The nested struct contains a ConstantArray representing the primitive array
        //   [100i64]
        // ConstantArray is not a canonical type, so converting `into_arrow()` should
        // map this to the nearest canonical type (PrimitiveArray).
        let inner = StructArray::from_fields(&[(
            "inner_a",
            ConstantArray::new(100i64, 1).into_array(),
        )])
        .unwrap()
        .into_array();

        // Outer struct with one plain primitive column and the nested struct column.
        let outer = StructArray::from_fields(&[("a", buffer![1u64].into_array()), ("b", inner)])
            .unwrap();

        let arrow_outer = outer.into_array().into_arrow_preferred().unwrap();
        let arrow_struct = arrow_outer
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .unwrap();

        // Column "a" must come back as a u64 primitive array.
        let first_column = arrow_struct.column(0);
        assert!(
            first_column
                .as_any()
                .downcast_ref::<ArrowPrimitiveArray<UInt64Type>>()
                .is_some()
        );

        // Column "b" must come back as a struct whose only child is an i64 primitive array.
        let inner_struct = arrow_struct
            .column(1)
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .unwrap();

        let inner_a = inner_struct
            .column(0)
            .as_any()
            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
        assert!(inner_a.is_some());

        assert_eq!(
            inner_a.cloned().unwrap(),
            ArrowPrimitiveArray::from_iter([100i64]),
        );
    }

    /// An Arrow struct with nullable rows and fields survives Arrow -> Vortex -> Arrow.
    #[test]
    fn roundtrip_struct() {
        // Struct-level validity: rows 0..4 valid, row 4 null, row 5 valid.
        let mut validity = NullBufferBuilder::new(6);
        validity.append_n_non_nulls(4);
        validity.append_null();
        validity.append_non_null();

        let names: ArrowArrayRef = Arc::new(StringViewArray::from_iter(vec![
            Some("Joseph"),
            None,
            Some("Angela"),
            Some("Mikhail"),
            None,
            None,
        ]));
        let ages: ArrowArrayRef = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
            Some(25),
            Some(31),
            None,
            Some(57),
            None,
            None,
        ]));

        let fields = vec![
            Arc::new(Field::new("name", DataType::Utf8View, true)),
            Arc::new(Field::new("age", DataType::Int32, true)),
        ];

        let arrow_struct =
            ArrowStructArray::new(fields.into(), vec![names, ages], validity.finish());

        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true);
        let roundtripped = vortex_struct.into_arrow_preferred().unwrap();

        assert_eq!(&arrow_struct, roundtripped.as_struct());
    }

    /// An Arrow list of strings survives Arrow -> Vortex -> Arrow with the same data type.
    #[test]
    fn roundtrip_list() {
        let names = Arc::new(StringArray::from_iter(vec![
            Some("Joseph"),
            Some("Angela"),
            Some("Mikhail"),
        ]));

        let element_field = Arc::new(Field::new_list_field(DataType::Utf8, true));
        let offsets = OffsetBuffer::from_lengths(vec![0, 2, 1]);
        let arrow_list = ArrowListArray::new(element_field, offsets, names, None);
        let list_data_type = arrow_list.data_type();

        let vortex_list = ArrayRef::from_arrow(&arrow_list, true);

        // Request the original data type explicitly, rather than the preferred one.
        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();

        let expected: ArrowArrayRef = Arc::new(arrow_list.clone());
        assert_eq!(expected.as_ref(), rt_arrow_list.as_ref());
    }
}
403}