vortex_array/arrays/varbin/
canonical.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use arrow_array::cast::AsArray;
7use arrow_array::{BinaryViewArray, StringViewArray};
8use arrow_schema::DataType;
9use vortex_dtype::DType;
10use vortex_error::VortexResult;
11
12use crate::arrays::VarBinVTable;
13use crate::arrays::varbin::VarBinArray;
14use crate::arrow::{FromArrowArray, IntoArrowArray};
15use crate::vtable::CanonicalVTable;
16use crate::{ArrayRef, Canonical, ToCanonical};
17
18impl CanonicalVTable<VarBinVTable> for VarBinVTable {
19    fn canonicalize(array: &VarBinArray) -> VortexResult<Canonical> {
20        let dtype = array.dtype().clone();
21        let nullable = dtype.is_nullable();
22
23        let array_ref = array.to_array().into_arrow_preferred()?;
24
25        let array = match (&dtype, array_ref.data_type()) {
26            (DType::Utf8(_), DataType::Utf8) => {
27                Arc::new(StringViewArray::from(array_ref.as_string::<i32>()))
28                    as Arc<dyn arrow_array::Array>
29            }
30            (DType::Utf8(_), DataType::LargeUtf8) => {
31                Arc::new(StringViewArray::from(array_ref.as_string::<i64>()))
32                    as Arc<dyn arrow_array::Array>
33            }
34
35            (DType::Binary(_), DataType::Binary) => {
36                Arc::new(BinaryViewArray::from(array_ref.as_binary::<i32>()))
37            }
38            (DType::Binary(_), DataType::LargeBinary) => {
39                Arc::new(BinaryViewArray::from(array_ref.as_binary::<i64>()))
40            }
41            // If its already a view, no need to do anything
42            (DType::Binary(_), DataType::BinaryView) | (DType::Utf8(_), DataType::Utf8View) => {
43                array_ref
44            }
45            _ => unreachable!("VarBinArray must have Utf8 or Binary dtype, instead got: {dtype}",),
46        };
47        Ok(Canonical::VarBinView(
48            ArrayRef::from_arrow(array.as_ref(), nullable).to_varbinview()?,
49        ))
50    }
51}
52
#[cfg(test)]
mod test {
    use rstest::rstest;
    use vortex_dtype::{DType, Nullability};

    use crate::arrays::varbin::builder::VarBinBuilder;
    use crate::canonical::ToCanonical;

    /// 12-byte value; the test expects its view to be inlined.
    const INLINED: &str = "123456789012";
    /// 13-byte value; the test expects its view not to be inlined.
    const NOT_INLINED: &str = "1234567890123";

    /// Builds a four-element VarBin array (two nulls, one 12-byte value, one 13-byte value)
    /// and checks that converting it to a VarBinView keeps the dtype, the validity of each
    /// slot, the stored bytes, and the expected inlined / non-inlined view layout.
    #[rstest]
    #[case(DType::Utf8(Nullability::Nullable))]
    #[case(DType::Binary(Nullability::Nullable))]
    fn test_canonical_varbin(#[case] dtype: DType) {
        let source = {
            let mut builder = VarBinBuilder::<i32>::with_capacity(10);
            // Slots 0 and 1: nulls.
            builder.append_null();
            builder.append_null();
            // Slot 2: inlined value.
            builder.append_value(INLINED.as_bytes());
            // Slot 3: non-inlinable value.
            builder.append_value(NOT_INLINED.as_bytes());
            builder.finish(dtype.clone())
        };

        let view = source.to_varbinview().unwrap();
        assert_eq!(view.dtype(), &dtype);

        // The two leading nulls must stay invalid.
        assert!(!view.is_valid(0).unwrap());
        assert!(!view.is_valid(1).unwrap());

        // Slot 2 holds the 12-byte value and is inlined.
        assert!(view.views()[2].is_inlined());
        assert_eq!(view.bytes_at(2).as_slice(), INLINED.as_bytes());

        // Slot 3 holds the 13-byte value and is not inlined.
        assert!(!view.views()[3].is_inlined());
        assert_eq!(view.bytes_at(3).as_slice(), NOT_INLINED.as_bytes());
    }
}