vortex_array/arrays/varbin/vtable/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use arrow_array::BinaryViewArray;
7use arrow_array::StringViewArray;
8use arrow_array::cast::AsArray;
9use arrow_schema::DataType;
10use vortex_dtype::DType;
11use vortex_error::VortexExpect;
12
13use crate::ArrayRef;
14use crate::Canonical;
15use crate::ToCanonical;
16use crate::arrays::VarBinVTable;
17use crate::arrays::varbin::VarBinArray;
18use crate::arrow::FromArrowArray;
19use crate::arrow::IntoArrowArray;
20use crate::vtable::CanonicalVTable;
21
22impl CanonicalVTable<VarBinVTable> for VarBinVTable {
23    fn canonicalize(array: &VarBinArray) -> Canonical {
24        let dtype = array.dtype().clone();
25        let nullable = dtype.is_nullable();
26
27        let array_ref = array
28            .to_array()
29            .into_arrow_preferred()
30            .vortex_expect("VarBinArray must be convertible to arrow array");
31
32        let array = match (&dtype, array_ref.data_type()) {
33            (DType::Utf8(_), DataType::Utf8) => {
34                Arc::new(StringViewArray::from(array_ref.as_string::<i32>()))
35                    as Arc<dyn arrow_array::Array>
36            }
37            (DType::Utf8(_), DataType::LargeUtf8) => {
38                Arc::new(StringViewArray::from(array_ref.as_string::<i64>()))
39                    as Arc<dyn arrow_array::Array>
40            }
41
42            (DType::Binary(_), DataType::Binary) => {
43                Arc::new(BinaryViewArray::from(array_ref.as_binary::<i32>()))
44            }
45            (DType::Binary(_), DataType::LargeBinary) => {
46                Arc::new(BinaryViewArray::from(array_ref.as_binary::<i64>()))
47            }
48            // If its already a view, no need to do anything
49            (DType::Binary(_), DataType::BinaryView) | (DType::Utf8(_), DataType::Utf8View) => {
50                array_ref
51            }
52            _ => unreachable!("VarBinArray must have Utf8 or Binary dtype, instead got: {dtype}",),
53        };
54        Canonical::VarBinView(ArrayRef::from_arrow(array.as_ref(), nullable).to_varbinview())
55    }
56}
57
#[cfg(test)]
mod tests {
    use rstest::rstest;
    use vortex_dtype::DType;
    use vortex_dtype::Nullability;

    use crate::arrays::varbin::builder::VarBinBuilder;
    use crate::canonical::ToCanonical;

    /// Canonicalizing a nullable `VarBinArray` must keep its dtype, its nulls and its bytes.
    ///
    /// The two payloads differ by one byte: the test expects the 12-byte value to end up as an
    /// inlined view and the 13-byte value as a non-inlined one.
    #[rstest]
    #[case(DType::Utf8(Nullability::Nullable))]
    #[case(DType::Binary(Nullability::Nullable))]
    fn test_canonical_varbin(#[case] dtype: DType) {
        let inlinable: &[u8] = "123456789012".as_bytes();
        let non_inlinable: &[u8] = "1234567890123".as_bytes();

        // Layout: [null, null, inlinable, non_inlinable]
        let mut builder = VarBinBuilder::<i32>::with_capacity(10);
        builder.append_null();
        builder.append_null();
        builder.append_value(inlinable);
        builder.append_value(non_inlinable);
        let source = builder.finish(dtype.clone());

        let view = source.to_varbinview();
        assert_eq!(view.dtype(), &dtype);

        // Both leading slots stay null.
        for null_idx in 0..2 {
            assert!(!view.is_valid(null_idx));
        }

        // Each value keeps its bytes and gets the expected view representation.
        for (idx, expected_bytes, expect_inlined) in
            [(2, inlinable, true), (3, non_inlinable, false)]
        {
            assert_eq!(view.views()[idx].is_inlined(), expect_inlined);
            assert_eq!(view.bytes_at(idx).as_slice(), expected_bytes);
        }
    }
}