vortex_array/
canonical.rs

1//! Encodings that enable zero-copy sharing of data with Arrow.
2
3use arrow_array::ArrayRef as ArrowArrayRef;
4use arrow_schema::DataType;
5use vortex_dtype::DType;
6use vortex_error::{VortexExpect, VortexResult, vortex_bail};
7
8use crate::arrays::{
9    BoolArray, DecimalArray, ExtensionArray, ListArray, NullArray, PrimitiveArray, StructArray,
10    VarBinViewArray,
11};
12use crate::arrow::IntoArrowArray;
13use crate::builders::builder_with_capacity;
14use crate::compute::{preferred_arrow_data_type, to_arrow};
15use crate::{Array, ArrayRef, IntoArray};
16
17/// The set of canonical array encodings, also the set of encodings that can be transferred to
18/// Arrow with zero-copy.
19///
20/// Note that a canonical form is not recursive, i.e. a StructArray may contain non-canonical
21/// child arrays, which may themselves need to be [canonicalized](ToCanonical).
22///
23/// # Logical vs. Physical encodings
24///
25/// Vortex separates logical and physical types, however this creates ambiguity with Arrow, there is
26/// no separation. Thus, if you receive an Arrow array, compress it using Vortex, and then
27/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
28/// variants to hold the data.
29///
30/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
31/// will correspond to an arrow-rs [`arrow_schema::DataType`].
32///
33/// # Views support
34///
35/// Binary and String views, also known as "German strings" are a better encoding format for
36/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
37/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
38/// for all `Utf8` and `Binary` typed arrays in Vortex.
39///
40#[derive(Debug, Clone)]
41pub enum Canonical {
42    Null(NullArray),
43    Bool(BoolArray),
44    Primitive(PrimitiveArray),
45    Decimal(DecimalArray),
46    Struct(StructArray),
47    // TODO(joe): maybe this should be a ListView, however this will be annoying in spiral
48    List(ListArray),
49    VarBinView(VarBinViewArray),
50    Extension(ExtensionArray),
51}
52
53impl Canonical {
54    // Create an empty canonical array of the given dtype.
55    pub fn empty(dtype: &DType) -> Canonical {
56        builder_with_capacity(dtype, 0)
57            .finish()
58            .to_canonical()
59            .vortex_expect("cannot fail to convert an empty array to canonical")
60    }
61}
62
63// Unwrap canonical type back down to specialized type.
64impl Canonical {
65    pub fn into_null(self) -> VortexResult<NullArray> {
66        match self {
67            Canonical::Null(a) => Ok(a),
68            _ => vortex_bail!("Cannot unwrap NullArray from {:?}", &self),
69        }
70    }
71
72    pub fn into_bool(self) -> VortexResult<BoolArray> {
73        match self {
74            Canonical::Bool(a) => Ok(a),
75            _ => vortex_bail!("Cannot unwrap BoolArray from {:?}", &self),
76        }
77    }
78
79    pub fn into_primitive(self) -> VortexResult<PrimitiveArray> {
80        match self {
81            Canonical::Primitive(a) => Ok(a),
82            _ => vortex_bail!("Cannot unwrap PrimitiveArray from {:?}", &self),
83        }
84    }
85
86    pub fn into_decimal(self) -> VortexResult<DecimalArray> {
87        match self {
88            Canonical::Decimal(a) => Ok(a),
89            _ => vortex_bail!("Cannot unwrap DecimalArray from {:?}", &self),
90        }
91    }
92
93    pub fn into_struct(self) -> VortexResult<StructArray> {
94        match self {
95            Canonical::Struct(a) => Ok(a),
96            _ => vortex_bail!("Cannot unwrap StructArray from {:?}", &self),
97        }
98    }
99
100    pub fn into_list(self) -> VortexResult<ListArray> {
101        match self {
102            Canonical::List(a) => Ok(a),
103            _ => vortex_bail!("Cannot unwrap StructArray from {:?}", &self),
104        }
105    }
106
107    pub fn into_varbinview(self) -> VortexResult<VarBinViewArray> {
108        match self {
109            Canonical::VarBinView(a) => Ok(a),
110            _ => vortex_bail!("Cannot unwrap VarBinViewArray from {:?}", &self),
111        }
112    }
113
114    pub fn into_extension(self) -> VortexResult<ExtensionArray> {
115        match self {
116            Canonical::Extension(a) => Ok(a),
117            _ => vortex_bail!("Cannot unwrap ExtensionArray from {:?}", &self),
118        }
119    }
120}
121
122impl AsRef<dyn Array> for Canonical {
123    fn as_ref(&self) -> &(dyn Array + 'static) {
124        match &self {
125            Canonical::Null(a) => a,
126            Canonical::Bool(a) => a,
127            Canonical::Primitive(a) => a,
128            Canonical::Decimal(a) => a,
129            Canonical::Struct(a) => a,
130            Canonical::List(a) => a,
131            Canonical::VarBinView(a) => a,
132            Canonical::Extension(a) => a,
133        }
134    }
135}
136
137impl IntoArray for Canonical {
138    fn into_array(self) -> ArrayRef {
139        match self {
140            Canonical::Null(a) => a.into_array(),
141            Canonical::Bool(a) => a.into_array(),
142            Canonical::Primitive(a) => a.into_array(),
143            Canonical::Decimal(a) => a.into_array(),
144            Canonical::Struct(a) => a.into_array(),
145            Canonical::List(a) => a.into_array(),
146            Canonical::VarBinView(a) => a.into_array(),
147            Canonical::Extension(a) => a.into_array(),
148        }
149    }
150}
151
152/// Trait for types that can be converted from an owned type into an owned array variant.
153///
154/// # Canonicalization
155///
156/// This trait has a blanket implementation for all types implementing [ToCanonical].
157pub trait ToCanonical: Array {
158    fn to_null(&self) -> VortexResult<NullArray> {
159        self.to_canonical()?.into_null()
160    }
161
162    fn to_bool(&self) -> VortexResult<BoolArray> {
163        self.to_canonical()?.into_bool()
164    }
165
166    fn to_primitive(&self) -> VortexResult<PrimitiveArray> {
167        self.to_canonical()?.into_primitive()
168    }
169
170    fn to_decimal(&self) -> VortexResult<DecimalArray> {
171        self.to_canonical()?.into_decimal()
172    }
173
174    fn to_struct(&self) -> VortexResult<StructArray> {
175        self.to_canonical()?.into_struct()
176    }
177
178    fn to_list(&self) -> VortexResult<ListArray> {
179        self.to_canonical()?.into_list()
180    }
181
182    fn to_varbinview(&self) -> VortexResult<VarBinViewArray> {
183        self.to_canonical()?.into_varbinview()
184    }
185
186    fn to_extension(&self) -> VortexResult<ExtensionArray> {
187        self.to_canonical()?.into_extension()
188    }
189}
190
191impl<A: Array + ?Sized> ToCanonical for A {}
192
193impl IntoArrowArray for ArrayRef {
194    /// Convert this [`ArrayRef`] into an Arrow [`ArrayRef`] by using the array's preferred
195    /// Arrow [`DataType`].
196    fn into_arrow_preferred(self) -> VortexResult<ArrowArrayRef> {
197        let data_type = preferred_arrow_data_type(&self)?;
198        self.into_arrow(&data_type)
199    }
200
201    fn into_arrow(self, data_type: &DataType) -> VortexResult<ArrowArrayRef> {
202        to_arrow(&self, data_type)
203    }
204}
205
206/// This conversion is always "free" and should not touch underlying data. All it does is create an
207/// owned pointer to the underlying concrete array type.
208///
209/// This combined with the above [ToCanonical] impl for [ArrayRef] allows simple two-way conversions
210/// between arbitrary Vortex encodings and canonical Arrow-compatible encodings.
211impl From<Canonical> for ArrayRef {
212    fn from(value: Canonical) -> Self {
213        match value {
214            Canonical::Null(a) => a.into_array(),
215            Canonical::Bool(a) => a.into_array(),
216            Canonical::Primitive(a) => a.into_array(),
217            Canonical::Decimal(a) => a.into_array(),
218            Canonical::Struct(a) => a.into_array(),
219            Canonical::List(a) => a.into_array(),
220            Canonical::VarBinView(a) => a.into_array(),
221            Canonical::Extension(a) => a.into_array(),
222        }
223    }
224}
225
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use arrow_array::cast::AsArray;
    use arrow_array::types::{Int32Type, Int64Type, UInt64Type};
    use arrow_array::{
        Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
        PrimitiveArray as ArrowPrimitiveArray, StringArray, StringViewArray,
        StructArray as ArrowStructArray,
    };
    use arrow_buffer::{NullBufferBuilder, OffsetBuffer};
    use arrow_schema::{DataType, Field};
    use vortex_buffer::buffer;

    use crate::array::Array;
    use crate::arrays::{ConstantArray, StructArray};
    use crate::arrow::{FromArrowArray, IntoArrowArray};
    use crate::{ArrayRef, IntoArray};

    /// A struct nested inside a struct, with a non-canonical leaf, must come out of the Arrow
    /// conversion as Arrow structs holding primitive columns.
    #[test]
    fn test_canonicalize_nested_struct() {
        // The nested struct contains a ConstantArray representing the primitive array
        //   [100i64]
        // ConstantArray is not a canonical type, so converting `into_arrow()` should
        // map this to the nearest canonical type (PrimitiveArray).
        let inner = StructArray::from_fields(&[(
            "inner_a",
            ConstantArray::new(100i64, 1).into_array(),
        )])
        .unwrap();

        // The outer struct has multiple internal components: a primitive column and the
        // nested struct built above.
        let outer = StructArray::from_fields(&[
            ("a", buffer![1u64].into_array()),
            ("b", inner.into_array()),
        ])
        .unwrap();

        let arrow = outer.into_array().into_arrow_preferred().unwrap();
        let arrow_struct = arrow
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .unwrap();

        // Column "a" must be an Arrow u64 primitive array.
        assert!(
            arrow_struct
                .column(0)
                .as_any()
                .is::<ArrowPrimitiveArray<UInt64Type>>()
        );

        // Column "b" must itself be an Arrow struct.
        let inner_struct = arrow_struct
            .column(1)
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .unwrap();

        let inner_a = inner_struct
            .column(0)
            .as_any()
            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
        assert!(inner_a.is_some());

        let expected = ArrowPrimitiveArray::<Int64Type>::from_iter([100i64]);
        assert_eq!(inner_a.unwrap(), &expected);
    }

    /// An Arrow struct with nullable columns and struct-level nulls must survive a trip through
    /// Vortex and back unchanged.
    #[test]
    fn roundtrip_struct() {
        let names: ArrowArrayRef = Arc::new(StringViewArray::from_iter([
            Some("Joseph"),
            None,
            Some("Angela"),
            Some("Mikhail"),
            None,
            None,
        ]));
        let ages: ArrowArrayRef = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
            Some(25),
            Some(31),
            None,
            Some(57),
            None,
            None,
        ]));

        // Struct-level validity: four valid rows, one null row, one valid row.
        let mut validity = NullBufferBuilder::new(6);
        validity.append_n_non_nulls(4);
        validity.append_null();
        validity.append_non_null();

        let name_field = Arc::new(Field::new("name", DataType::Utf8View, true));
        let age_field = Arc::new(Field::new("age", DataType::Int32, true));

        let arrow_struct = ArrowStructArray::new(
            vec![name_field, age_field].into(),
            vec![names, ages],
            validity.finish(),
        );

        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true);
        let roundtripped = vortex_struct.into_arrow_preferred().unwrap();

        assert_eq!(&arrow_struct, roundtripped.as_struct());
    }

    /// An Arrow list of strings must survive a trip through Vortex and back when converted
    /// with its original data type.
    #[test]
    fn roundtrip_list() {
        let names = Arc::new(StringArray::from_iter([
            Some("Joseph"),
            Some("Angela"),
            Some("Mikhail"),
        ]));

        let item_field = Arc::new(Field::new_list_field(DataType::Utf8, true));
        let offsets = OffsetBuffer::from_lengths([0, 2, 1]);
        let arrow_list = ArrowListArray::new(item_field, offsets, names, None);
        let list_data_type = arrow_list.data_type();

        let vortex_list = ArrayRef::from_arrow(&arrow_list, true);
        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();

        let expected: ArrowArrayRef = Arc::new(arrow_list.clone());
        assert_eq!(expected.as_ref(), rt_arrow_list.as_ref());
    }
}