vortex_array/
canonical.rs

1//! Encodings that enable zero-copy sharing of data with Arrow.
2
3use arrow_array::ArrayRef as ArrowArrayRef;
4use arrow_schema::DataType;
5use vortex_dtype::DType;
6use vortex_error::{VortexExpect, VortexResult, vortex_bail};
7
8use crate::arrays::{
9    BoolArray, ExtensionArray, ListArray, NullArray, PrimitiveArray, StructArray, VarBinViewArray,
10};
11use crate::arrow::IntoArrowArray;
12use crate::builders::builder_with_capacity;
13use crate::compute::{preferred_arrow_data_type, to_arrow};
14use crate::{Array, ArrayRef, IntoArray};
15
/// The set of canonical array encodings, also the set of encodings that can be transferred to
/// Arrow with zero-copy.
///
/// Note that a canonical form is not recursive, i.e. a StructArray may contain non-canonical
/// child arrays, which may themselves need to be [canonicalized](ToCanonical).
///
/// # Logical vs. Physical encodings
///
/// Vortex separates logical and physical types, whereas Arrow makes no such separation, which
/// creates ambiguity when moving data between the two. If you receive an Arrow array, compress
/// it using Vortex, and then decompress it later to pass to a compute kernel, there are multiple
/// suitable Arrow array variants to hold the data.
///
/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
/// will correspond to an arrow-rs [`arrow_schema::DataType`].
///
/// # Views support
///
/// Binary and String views, also known as "German strings", are a better encoding format for
/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
/// for all `Utf8` and `Binary` typed arrays in Vortex.
#[derive(Debug, Clone)]
pub enum Canonical {
    /// Canonical form held as a [`NullArray`].
    Null(NullArray),
    /// Canonical form held as a [`BoolArray`].
    Bool(BoolArray),
    /// Canonical form held as a [`PrimitiveArray`].
    Primitive(PrimitiveArray),
    /// Canonical form held as a [`StructArray`]; its children are not necessarily canonical.
    Struct(StructArray),
    // TODO(joe): maybe this should be a ListView, however this will be annoying in spiral
    /// Canonical form held as a [`ListArray`].
    List(ListArray),
    /// Canonical form held as a [`VarBinViewArray`], used for `Utf8` and `Binary` typed arrays.
    VarBinView(VarBinViewArray),
    /// Canonical form held as an [`ExtensionArray`].
    Extension(ExtensionArray),
}
50
51impl Canonical {
52    // Create an empty canonical array of the given dtype.
53    pub fn empty(dtype: &DType) -> Canonical {
54        builder_with_capacity(dtype, 0)
55            .finish()
56            .to_canonical()
57            .vortex_expect("cannot fail to convert an empty array to canonical")
58    }
59}
60
61// Unwrap canonical type back down to specialized type.
62impl Canonical {
63    pub fn into_null(self) -> VortexResult<NullArray> {
64        match self {
65            Canonical::Null(a) => Ok(a),
66            _ => vortex_bail!("Cannot unwrap NullArray from {:?}", &self),
67        }
68    }
69
70    pub fn into_bool(self) -> VortexResult<BoolArray> {
71        match self {
72            Canonical::Bool(a) => Ok(a),
73            _ => vortex_bail!("Cannot unwrap BoolArray from {:?}", &self),
74        }
75    }
76
77    pub fn into_primitive(self) -> VortexResult<PrimitiveArray> {
78        match self {
79            Canonical::Primitive(a) => Ok(a),
80            _ => vortex_bail!("Cannot unwrap PrimitiveArray from {:?}", &self),
81        }
82    }
83
84    pub fn into_struct(self) -> VortexResult<StructArray> {
85        match self {
86            Canonical::Struct(a) => Ok(a),
87            _ => vortex_bail!("Cannot unwrap StructArray from {:?}", &self),
88        }
89    }
90
91    pub fn into_list(self) -> VortexResult<ListArray> {
92        match self {
93            Canonical::List(a) => Ok(a),
94            _ => vortex_bail!("Cannot unwrap StructArray from {:?}", &self),
95        }
96    }
97
98    pub fn into_varbinview(self) -> VortexResult<VarBinViewArray> {
99        match self {
100            Canonical::VarBinView(a) => Ok(a),
101            _ => vortex_bail!("Cannot unwrap VarBinViewArray from {:?}", &self),
102        }
103    }
104
105    pub fn into_extension(self) -> VortexResult<ExtensionArray> {
106        match self {
107            Canonical::Extension(a) => Ok(a),
108            _ => vortex_bail!("Cannot unwrap ExtensionArray from {:?}", &self),
109        }
110    }
111}
112
113impl AsRef<dyn Array> for Canonical {
114    fn as_ref(&self) -> &(dyn Array + 'static) {
115        match &self {
116            Canonical::Null(a) => a,
117            Canonical::Bool(a) => a,
118            Canonical::Primitive(a) => a,
119            Canonical::Struct(a) => a,
120            Canonical::List(a) => a,
121            Canonical::VarBinView(a) => a,
122            Canonical::Extension(a) => a,
123        }
124    }
125}
126
127impl IntoArray for Canonical {
128    fn into_array(self) -> ArrayRef {
129        match self {
130            Canonical::Null(a) => a.into_array(),
131            Canonical::Bool(a) => a.into_array(),
132            Canonical::Primitive(a) => a.into_array(),
133            Canonical::Struct(a) => a.into_array(),
134            Canonical::List(a) => a.into_array(),
135            Canonical::VarBinView(a) => a.into_array(),
136            Canonical::Extension(a) => a.into_array(),
137        }
138    }
139}
140
141/// Trait for types that can be converted from an owned type into an owned array variant.
142///
143/// # Canonicalization
144///
145/// This trait has a blanket implementation for all types implementing [ToCanonical].
146pub trait ToCanonical: Array {
147    fn to_null(&self) -> VortexResult<NullArray> {
148        self.to_canonical()?.into_null()
149    }
150
151    fn to_bool(&self) -> VortexResult<BoolArray> {
152        self.to_canonical()?.into_bool()
153    }
154
155    fn to_primitive(&self) -> VortexResult<PrimitiveArray> {
156        self.to_canonical()?.into_primitive()
157    }
158
159    fn to_struct(&self) -> VortexResult<StructArray> {
160        self.to_canonical()?.into_struct()
161    }
162
163    fn to_list(&self) -> VortexResult<ListArray> {
164        self.to_canonical()?.into_list()
165    }
166
167    fn to_varbinview(&self) -> VortexResult<VarBinViewArray> {
168        self.to_canonical()?.into_varbinview()
169    }
170
171    fn to_extension(&self) -> VortexResult<ExtensionArray> {
172        self.to_canonical()?.into_extension()
173    }
174}
175
176impl<A: Array + ?Sized> ToCanonical for A {}
177
178impl IntoArrowArray for ArrayRef {
179    /// Convert this [`ArrayRef`] into an Arrow [`ArrayRef`] by using the array's preferred
180    /// Arrow [`DataType`].
181    fn into_arrow_preferred(self) -> VortexResult<ArrowArrayRef> {
182        let data_type = preferred_arrow_data_type(&self)?;
183        self.into_arrow(&data_type)
184    }
185
186    fn into_arrow(self, data_type: &DataType) -> VortexResult<ArrowArrayRef> {
187        to_arrow(&self, data_type)
188    }
189}
190
191/// This conversion is always "free" and should not touch underlying data. All it does is create an
192/// owned pointer to the underlying concrete array type.
193///
194/// This combined with the above [ToCanonical] impl for [ArrayRef] allows simple two-way conversions
195/// between arbitrary Vortex encodings and canonical Arrow-compatible encodings.
196impl From<Canonical> for ArrayRef {
197    fn from(value: Canonical) -> Self {
198        match value {
199            Canonical::Null(a) => a.into_array(),
200            Canonical::Bool(a) => a.into_array(),
201            Canonical::Primitive(a) => a.into_array(),
202            Canonical::Struct(a) => a.into_array(),
203            Canonical::List(a) => a.into_array(),
204            Canonical::VarBinView(a) => a.into_array(),
205            Canonical::Extension(a) => a.into_array(),
206        }
207    }
208}
209
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use arrow_array::cast::AsArray;
    use arrow_array::types::{Int32Type, Int64Type, UInt64Type};
    use arrow_array::{
        Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
        PrimitiveArray as ArrowPrimitiveArray, StringArray, StringViewArray,
        StructArray as ArrowStructArray,
    };
    use arrow_buffer::{NullBufferBuilder, OffsetBuffer};
    use arrow_schema::{DataType, Field};
    use vortex_buffer::buffer;

    use crate::array::Array;
    use crate::arrays::{ConstantArray, StructArray};
    use crate::arrow::{FromArrowArray, IntoArrowArray};
    use crate::{ArrayRef, IntoArray};

    /// Converting a struct whose nested child is a non-canonical encoding must yield plain
    /// Arrow struct/primitive arrays all the way down.
    #[test]
    fn test_canonicalize_nested_struct() {
        // Create a struct array with multiple internal components.
        let nested_struct_array = StructArray::from_fields(&[
            ("a", buffer![1u64].into_array()),
            (
                "b",
                StructArray::from_fields(&[(
                    "inner_a",
                    // The nested struct contains a ConstantArray representing the primitive array
                    //   [100i64]
                    // ConstantArray is not a canonical type, so `into_arrow_preferred()` should
                    // map this to the nearest canonical type (PrimitiveArray).
                    ConstantArray::new(100i64, 1).into_array(),
                )])
                .unwrap()
                .into_array(),
            ),
        ])
        .unwrap();

        // Keep the converted array alive so the downcasts below can borrow from it instead of
        // cloning each level.
        let arrow_array = nested_struct_array
            .into_array()
            .into_arrow_preferred()
            .unwrap();
        let arrow_struct = arrow_array
            .as_struct_opt()
            .expect("top-level array should be an Arrow StructArray");

        assert!(
            arrow_struct
                .column(0)
                .as_primitive_opt::<UInt64Type>()
                .is_some()
        );

        let inner_struct = arrow_struct
            .column(1)
            .as_struct_opt()
            .expect("column `b` should be an Arrow StructArray");

        let inner_a = inner_struct
            .column(0)
            .as_primitive_opt::<Int64Type>()
            .expect("column `inner_a` should be an Arrow Int64 PrimitiveArray");

        assert_eq!(
            inner_a,
            &ArrowPrimitiveArray::<Int64Type>::from_iter([100i64]),
        );
    }

    /// An Arrow struct with nullable children and its own validity survives a round trip
    /// through Vortex unchanged.
    #[test]
    fn roundtrip_struct() {
        // Struct-level validity: only row 4 is null.
        let mut nulls = NullBufferBuilder::new(6);
        nulls.append_n_non_nulls(4);
        nulls.append_null();
        nulls.append_non_null();
        let names = Arc::new(StringViewArray::from_iter(vec![
            Some("Joseph"),
            None,
            Some("Angela"),
            Some("Mikhail"),
            None,
            None,
        ]));
        let ages = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
            Some(25),
            Some(31),
            None,
            Some(57),
            None,
            None,
        ]));

        let arrow_struct = ArrowStructArray::new(
            vec![
                Arc::new(Field::new("name", DataType::Utf8View, true)),
                Arc::new(Field::new("age", DataType::Int32, true)),
            ]
            .into(),
            vec![names, ages],
            nulls.finish(),
        );

        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true);

        assert_eq!(
            &arrow_struct,
            vortex_struct.into_arrow_preferred().unwrap().as_struct()
        );
    }

    /// An Arrow list of `Utf8` strings survives a round trip through Vortex when converted
    /// back with its original Arrow data type.
    #[test]
    fn roundtrip_list() {
        let names = Arc::new(StringArray::from_iter(vec![
            Some("Joseph"),
            Some("Angela"),
            Some("Mikhail"),
        ]));

        let arrow_list = ArrowListArray::new(
            Arc::new(Field::new_list_field(DataType::Utf8, true)),
            OffsetBuffer::from_lengths(vec![0, 2, 1]),
            names,
            None,
        );
        let list_data_type = arrow_list.data_type();

        let vortex_list = ArrayRef::from_arrow(&arrow_list, true);

        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();

        // `list_data_type` is no longer borrowed here, so the original can be moved into the
        // expected value rather than cloned.
        let expected: ArrowArrayRef = Arc::new(arrow_list);
        assert_eq!(expected.as_ref(), rt_arrow_list.as_ref());
    }
}