vortex_array/arrow/compute/to_arrow/
varbin.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use arrow_array::ArrayRef as ArrowArrayRef;
7use arrow_array::GenericBinaryArray;
8use arrow_array::GenericStringArray;
9use arrow_array::OffsetSizeTrait;
10use arrow_schema::DataType;
11use vortex_dtype::DType;
12use vortex_dtype::IntegerPType;
13use vortex_dtype::Nullability;
14use vortex_dtype::PType;
15use vortex_error::VortexResult;
16use vortex_error::vortex_bail;
17use vortex_error::vortex_panic;
18
19use crate::Array;
20use crate::ToCanonical;
21use crate::arrays::VarBinArray;
22use crate::arrays::VarBinVTable;
23use crate::arrow::compute::ToArrowKernel;
24use crate::arrow::compute::ToArrowKernelAdapter;
25use crate::arrow::compute::to_arrow::null_buffer::to_null_buffer;
26use crate::compute::cast;
27use crate::register_kernel;
28
29impl ToArrowKernel for VarBinVTable {
30    fn to_arrow(
31        &self,
32        array: &VarBinArray,
33        arrow_type: Option<&DataType>,
34    ) -> VortexResult<Option<ArrowArrayRef>> {
35        let offsets_ptype = PType::try_from(array.offsets().dtype())?;
36
37        match arrow_type {
38            // Emit out preferred Arrow VarBin array.
39            None => match array.dtype() {
40                DType::Binary(_) => match offsets_ptype {
41                    PType::I64 | PType::U64 => to_arrow::<i64>(array),
42                    PType::U8 | PType::U16 | PType::U32 | PType::I8 | PType::I16 | PType::I32 => {
43                        to_arrow::<i32>(array)
44                    }
45                    PType::F16 | PType::F32 | PType::F64 => {
46                        vortex_panic!("offsets array were somehow floating point")
47                    }
48                },
49                DType::Utf8(_) => match offsets_ptype {
50                    PType::I64 | PType::U64 => to_arrow::<i64>(array),
51                    PType::U8 | PType::U16 | PType::U32 | PType::I8 | PType::I16 | PType::I32 => {
52                        to_arrow::<i32>(array)
53                    }
54                    PType::F16 | PType::F32 | PType::F64 => {
55                        vortex_panic!("offsets array were somehow floating point")
56                    }
57                },
58                dtype => unreachable!("Unsupported DType {dtype}"),
59            },
60            // Emit the requested Arrow array.
61            Some(DataType::Binary) if array.dtype().is_binary() => to_arrow::<i32>(array),
62            Some(DataType::LargeBinary) if array.dtype().is_binary() => to_arrow::<i64>(array),
63            Some(DataType::Utf8) if array.dtype().is_utf8() => to_arrow::<i32>(array),
64            Some(DataType::LargeUtf8) if array.dtype().is_utf8() => to_arrow::<i64>(array),
65            // Allow fallback to canonicalize to a VarBinView and try again.
66            Some(DataType::BinaryView) | Some(DataType::Utf8View) => {
67                return Ok(None);
68            }
69            // Any other type is not supported.
70            Some(_) => {
71                vortex_bail!("Cannot convert VarBin to Arrow type {arrow_type:?}");
72            }
73        }
74        .map(Some)
75    }
76}
77
// Register the `ToArrowKernel` implementation for `VarBinVTable`, wrapped in
// `ToArrowKernelAdapter` and lifted via `.lift()`.
// NOTE(review): presumably this makes the kernel discoverable by the generic to-Arrow
// dispatch — confirm against the `register_kernel!` definition.
register_kernel!(ToArrowKernelAdapter(VarBinVTable).lift());
79
80fn to_arrow<O: IntegerPType + OffsetSizeTrait>(array: &VarBinArray) -> VortexResult<ArrowArrayRef> {
81    let offsets = cast(
82        array.offsets(),
83        &DType::Primitive(O::PTYPE, Nullability::NonNullable),
84    )?
85    .to_primitive();
86
87    let nulls = to_null_buffer(array.validity_mask());
88    let data = array.bytes().clone();
89
90    // Match on the `DType`.
91    Ok(match array.dtype() {
92        DType::Binary(_) => Arc::new(unsafe {
93            GenericBinaryArray::new_unchecked(
94                offsets.buffer::<O>().into_arrow_offset_buffer(),
95                data.into_arrow_buffer(),
96                nulls,
97            )
98        }),
99        DType::Utf8(_) => Arc::new(unsafe {
100            GenericStringArray::new_unchecked(
101                offsets.buffer::<O>().into_arrow_offset_buffer(),
102                data.into_arrow_buffer(),
103                nulls,
104            )
105        }),
106        dtype => {
107            unreachable!("expected utf8 or binary instead of {dtype}")
108        }
109    })
110}