vortex_array/
canonical.rs

1//! Encodings that enable zero-copy sharing of data with Arrow.
2
3use vortex_dtype::DType;
4use vortex_error::{VortexExpect, VortexResult, vortex_bail};
5
6use crate::arrays::{
7    BoolArray, DecimalArray, ExtensionArray, ListArray, NullArray, PrimitiveArray, StructArray,
8    VarBinViewArray,
9};
10use crate::builders::builder_with_capacity;
11use crate::{Array, ArrayRef, IntoArray};
12
13/// The set of canonical array encodings, also the set of encodings that can be transferred to
14/// Arrow with zero-copy.
15///
16/// Note that a canonical form is not recursive, i.e. a StructArray may contain non-canonical
17/// child arrays, which may themselves need to be [canonicalized](ToCanonical).
18///
19/// # Logical vs. Physical encodings
20///
21/// Vortex separates logical and physical types, however this creates ambiguity with Arrow, there is
22/// no separation. Thus, if you receive an Arrow array, compress it using Vortex, and then
23/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
24/// variants to hold the data.
25///
26/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
27/// will correspond to an arrow-rs [`arrow_schema::DataType`].
28///
29/// # Views support
30///
31/// Binary and String views, also known as "German strings" are a better encoding format for
32/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
33/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
34/// for all `Utf8` and `Binary` typed arrays in Vortex.
35///
36#[derive(Debug, Clone)]
37pub enum Canonical {
38    Null(NullArray),
39    Bool(BoolArray),
40    Primitive(PrimitiveArray),
41    Decimal(DecimalArray),
42    Struct(StructArray),
43    // TODO(joe): maybe this should be a ListView, however this will be annoying in spiral
44    List(ListArray),
45    VarBinView(VarBinViewArray),
46    Extension(ExtensionArray),
47}
48
49impl Canonical {
50    // Create an empty canonical array of the given dtype.
51    pub fn empty(dtype: &DType) -> Canonical {
52        builder_with_capacity(dtype, 0)
53            .finish()
54            .to_canonical()
55            .vortex_expect("cannot fail to convert an empty array to canonical")
56    }
57}
58
59// Unwrap canonical type back down to specialized type.
60impl Canonical {
61    pub fn into_null(self) -> VortexResult<NullArray> {
62        match self {
63            Canonical::Null(a) => Ok(a),
64            _ => vortex_bail!("Cannot unwrap NullArray from {:?}", &self),
65        }
66    }
67
68    pub fn into_bool(self) -> VortexResult<BoolArray> {
69        match self {
70            Canonical::Bool(a) => Ok(a),
71            _ => vortex_bail!("Cannot unwrap BoolArray from {:?}", &self),
72        }
73    }
74
75    pub fn into_primitive(self) -> VortexResult<PrimitiveArray> {
76        match self {
77            Canonical::Primitive(a) => Ok(a),
78            _ => vortex_bail!("Cannot unwrap PrimitiveArray from {:?}", &self),
79        }
80    }
81
82    pub fn into_decimal(self) -> VortexResult<DecimalArray> {
83        match self {
84            Canonical::Decimal(a) => Ok(a),
85            _ => vortex_bail!("Cannot unwrap DecimalArray from {:?}", &self),
86        }
87    }
88
89    pub fn into_struct(self) -> VortexResult<StructArray> {
90        match self {
91            Canonical::Struct(a) => Ok(a),
92            _ => vortex_bail!("Cannot unwrap StructArray from {:?}", &self),
93        }
94    }
95
96    pub fn into_list(self) -> VortexResult<ListArray> {
97        match self {
98            Canonical::List(a) => Ok(a),
99            _ => vortex_bail!("Cannot unwrap StructArray from {:?}", &self),
100        }
101    }
102
103    pub fn into_varbinview(self) -> VortexResult<VarBinViewArray> {
104        match self {
105            Canonical::VarBinView(a) => Ok(a),
106            _ => vortex_bail!("Cannot unwrap VarBinViewArray from {:?}", &self),
107        }
108    }
109
110    pub fn into_extension(self) -> VortexResult<ExtensionArray> {
111        match self {
112            Canonical::Extension(a) => Ok(a),
113            _ => vortex_bail!("Cannot unwrap ExtensionArray from {:?}", &self),
114        }
115    }
116}
117
118impl AsRef<dyn Array> for Canonical {
119    fn as_ref(&self) -> &(dyn Array + 'static) {
120        match &self {
121            Canonical::Null(a) => a.as_ref(),
122            Canonical::Bool(a) => a.as_ref(),
123            Canonical::Primitive(a) => a.as_ref(),
124            Canonical::Decimal(a) => a.as_ref(),
125            Canonical::Struct(a) => a.as_ref(),
126            Canonical::List(a) => a.as_ref(),
127            Canonical::VarBinView(a) => a.as_ref(),
128            Canonical::Extension(a) => a.as_ref(),
129        }
130    }
131}
132
133impl IntoArray for Canonical {
134    fn into_array(self) -> ArrayRef {
135        match self {
136            Canonical::Null(a) => a.into_array(),
137            Canonical::Bool(a) => a.into_array(),
138            Canonical::Primitive(a) => a.into_array(),
139            Canonical::Decimal(a) => a.into_array(),
140            Canonical::Struct(a) => a.into_array(),
141            Canonical::List(a) => a.into_array(),
142            Canonical::VarBinView(a) => a.into_array(),
143            Canonical::Extension(a) => a.into_array(),
144        }
145    }
146}
147
148/// Trait for types that can be converted from an owned type into an owned array variant.
149///
150/// # Canonicalization
151///
152/// This trait has a blanket implementation for all types implementing [ToCanonical].
153pub trait ToCanonical: Array {
154    fn to_null(&self) -> VortexResult<NullArray> {
155        self.to_canonical()?.into_null()
156    }
157
158    fn to_bool(&self) -> VortexResult<BoolArray> {
159        self.to_canonical()?.into_bool()
160    }
161
162    fn to_primitive(&self) -> VortexResult<PrimitiveArray> {
163        self.to_canonical()?.into_primitive()
164    }
165
166    fn to_decimal(&self) -> VortexResult<DecimalArray> {
167        self.to_canonical()?.into_decimal()
168    }
169
170    fn to_struct(&self) -> VortexResult<StructArray> {
171        self.to_canonical()?.into_struct()
172    }
173
174    fn to_list(&self) -> VortexResult<ListArray> {
175        self.to_canonical()?.into_list()
176    }
177
178    fn to_varbinview(&self) -> VortexResult<VarBinViewArray> {
179        self.to_canonical()?.into_varbinview()
180    }
181
182    fn to_extension(&self) -> VortexResult<ExtensionArray> {
183        self.to_canonical()?.into_extension()
184    }
185}
186
187impl<A: Array + ?Sized> ToCanonical for A {}
188
189/// This conversion is always "free" and should not touch underlying data. All it does is create an
190/// owned pointer to the underlying concrete array type.
191///
192/// This combined with the above [ToCanonical] impl for [ArrayRef] allows simple two-way conversions
193/// between arbitrary Vortex encodings and canonical Arrow-compatible encodings.
194impl From<Canonical> for ArrayRef {
195    fn from(value: Canonical) -> Self {
196        match value {
197            Canonical::Null(a) => a.into_array(),
198            Canonical::Bool(a) => a.into_array(),
199            Canonical::Primitive(a) => a.into_array(),
200            Canonical::Decimal(a) => a.into_array(),
201            Canonical::Struct(a) => a.into_array(),
202            Canonical::List(a) => a.into_array(),
203            Canonical::VarBinView(a) => a.into_array(),
204            Canonical::Extension(a) => a.into_array(),
205        }
206    }
207}
208
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use arrow_array::cast::AsArray;
    use arrow_array::types::{Int32Type, Int64Type, UInt64Type};
    use arrow_array::{
        Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
        PrimitiveArray as ArrowPrimitiveArray, StringArray, StringViewArray,
        StructArray as ArrowStructArray,
    };
    use arrow_buffer::{NullBufferBuilder, OffsetBuffer};
    use arrow_schema::{DataType, Field};
    use vortex_buffer::buffer;

    use crate::arrays::{ConstantArray, StructArray};
    use crate::arrow::{FromArrowArray, IntoArrowArray};
    use crate::{ArrayRef, IntoArray};

    /// A struct nested inside a struct, whose leaf is a non-canonical `ConstantArray`, must come
    /// out of the Arrow conversion with primitive Arrow arrays at the leaves.
    #[test]
    fn test_canonicalize_nested_struct() {
        // The nested struct contains a ConstantArray representing the primitive array
        //   [100i64]
        // ConstantArray is not a canonical type, so converting into Arrow should map it to the
        // nearest canonical type (PrimitiveArray).
        let inner_vortex_struct = StructArray::from_fields(&[(
            "inner_a",
            ConstantArray::new(100i64, 1).into_array(),
        )])
        .unwrap();

        // Outer struct with multiple internal components.
        let outer_vortex_struct = StructArray::from_fields(&[
            ("a", buffer![1u64].into_array()),
            ("b", inner_vortex_struct.into_array()),
        ])
        .unwrap();

        let arrow_array = outer_vortex_struct
            .into_array()
            .into_arrow_preferred()
            .unwrap();
        let arrow_struct = arrow_array
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .unwrap();

        // Column "a" must be an Arrow u64 primitive array.
        let column_a = arrow_struct
            .column(0)
            .as_any()
            .downcast_ref::<ArrowPrimitiveArray<UInt64Type>>();
        assert!(column_a.is_some());

        // Column "b" must itself be an Arrow struct.
        let inner_struct = arrow_struct
            .column(1)
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .unwrap();

        // The former ConstantArray leaf must now be an Arrow i64 primitive array.
        let inner_a = inner_struct
            .column(0)
            .as_any()
            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
        assert!(inner_a.is_some());

        let expected_inner_a = ArrowPrimitiveArray::<Int64Type>::from_iter([100i64]);
        assert_eq!(inner_a.unwrap(), &expected_inner_a);
    }

    /// An Arrow struct with nullable rows and nullable children survives Arrow -> Vortex -> Arrow.
    #[test]
    fn roundtrip_struct() {
        // Struct-level validity: four valid rows, one null row, one valid row.
        let mut validity = NullBufferBuilder::new(6);
        validity.append_n_non_nulls(4);
        validity.append_null();
        validity.append_non_null();

        let names: ArrowArrayRef = Arc::new(StringViewArray::from_iter(vec![
            Some("Joseph"),
            None,
            Some("Angela"),
            Some("Mikhail"),
            None,
            None,
        ]));
        let ages: ArrowArrayRef = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
            Some(25),
            Some(31),
            None,
            Some(57),
            None,
            None,
        ]));

        let arrow_struct = ArrowStructArray::new(
            vec![
                Arc::new(Field::new("name", DataType::Utf8View, true)),
                Arc::new(Field::new("age", DataType::Int32, true)),
            ]
            .into(),
            vec![names, ages],
            validity.finish(),
        );

        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true);
        let roundtripped = vortex_struct.into_arrow_preferred().unwrap();

        assert_eq!(&arrow_struct, roundtripped.as_struct());
    }

    /// An Arrow list of strings survives Arrow -> Vortex -> Arrow when the original Arrow data
    /// type is requested on the way back.
    #[test]
    fn roundtrip_list() {
        let names = Arc::new(StringArray::from_iter(vec![
            Some("Joseph"),
            Some("Angela"),
            Some("Mikhail"),
        ]));

        // Three lists of lengths 0, 2 and 1 over the three names.
        let arrow_list = ArrowListArray::new(
            Arc::new(Field::new_list_field(DataType::Utf8, true)),
            OffsetBuffer::from_lengths(vec![0, 2, 1]),
            names,
            None,
        );
        let list_data_type = arrow_list.data_type();

        let vortex_list = ArrayRef::from_arrow(&arrow_list, true);
        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();

        let expected: ArrowArrayRef = Arc::new(arrow_list.clone());
        assert_eq!(expected.as_ref(), rt_arrow_list.as_ref());
    }
}