vortex_array/arrays/varbin/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use arrow_array::cast::AsArray;
7use arrow_array::{BinaryViewArray, StringViewArray};
8use arrow_schema::DataType;
9use vortex_dtype::DType;
10use vortex_error::VortexExpect;
11
12use crate::arrays::VarBinVTable;
13use crate::arrays::varbin::VarBinArray;
14use crate::arrow::{FromArrowArray, IntoArrowArray};
15use crate::vtable::CanonicalVTable;
16use crate::{ArrayRef, Canonical, ToCanonical};
17
18impl CanonicalVTable<VarBinVTable> for VarBinVTable {
19    fn canonicalize(array: &VarBinArray) -> Canonical {
20        let dtype = array.dtype().clone();
21        let nullable = dtype.is_nullable();
22
23        let array_ref = array
24            .to_array()
25            .into_arrow_preferred()
26            .vortex_expect("VarBinArray must be convertible to arrow array");
27
28        let array = match (&dtype, array_ref.data_type()) {
29            (DType::Utf8(_), DataType::Utf8) => {
30                Arc::new(StringViewArray::from(array_ref.as_string::<i32>()))
31                    as Arc<dyn arrow_array::Array>
32            }
33            (DType::Utf8(_), DataType::LargeUtf8) => {
34                Arc::new(StringViewArray::from(array_ref.as_string::<i64>()))
35                    as Arc<dyn arrow_array::Array>
36            }
37
38            (DType::Binary(_), DataType::Binary) => {
39                Arc::new(BinaryViewArray::from(array_ref.as_binary::<i32>()))
40            }
41            (DType::Binary(_), DataType::LargeBinary) => {
42                Arc::new(BinaryViewArray::from(array_ref.as_binary::<i64>()))
43            }
44            // If its already a view, no need to do anything
45            (DType::Binary(_), DataType::BinaryView) | (DType::Utf8(_), DataType::Utf8View) => {
46                array_ref
47            }
48            _ => unreachable!("VarBinArray must have Utf8 or Binary dtype, instead got: {dtype}",),
49        };
50        Canonical::VarBinView(ArrayRef::from_arrow(array.as_ref(), nullable).to_varbinview())
51    }
52}
53
#[cfg(test)]
mod test {
    use rstest::rstest;
    use vortex_dtype::{DType, Nullability};

    use crate::arrays::varbin::builder::VarBinBuilder;
    use crate::canonical::ToCanonical;

    /// 12-byte value; the test expects its view to be inlined.
    const INLINED: &[u8] = b"123456789012";
    /// 13-byte value; the test expects its view to not be inlined.
    const NOT_INLINED: &[u8] = b"1234567890123";

    /// Builds `[null, null, INLINED, NOT_INLINED]` as a VarBin array of the given dtype and
    /// checks dtype, validity, inlining and contents after conversion to a varbinview.
    #[rstest]
    #[case(DType::Utf8(Nullability::Nullable))]
    #[case(DType::Binary(Nullability::Nullable))]
    fn test_canonical_varbin(#[case] dtype: DType) {
        let mut builder = VarBinBuilder::<i32>::with_capacity(10);
        for _ in 0..2 {
            builder.append_null();
        }
        builder.append_value(INLINED);
        builder.append_value(NOT_INLINED);
        let source = builder.finish(dtype.clone());

        let view = source.to_varbinview();
        assert_eq!(view.dtype(), &dtype);

        // The two leading slots are null.
        for idx in [0, 1] {
            assert!(!view.is_valid(idx));
        }

        // Slot 2 holds the 12-byte value and is inlined.
        assert!(view.views()[2].is_inlined());
        assert_eq!(view.bytes_at(2).as_slice(), INLINED);

        // Slot 3 holds the 13-byte value and is not inlined.
        assert!(!view.views()[3].is_inlined());
        assert_eq!(view.bytes_at(3).as_slice(), NOT_INLINED);
    }
}