vortex_array/arrow/compute/to_arrow/
varbin.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use arrow_array::{
7    ArrayRef as ArrowArrayRef, GenericBinaryArray, GenericStringArray, OffsetSizeTrait,
8};
9use arrow_schema::DataType;
10use vortex_dtype::{DType, IntegerPType, Nullability, PType};
11use vortex_error::{VortexResult, vortex_bail, vortex_panic};
12
13use crate::arrays::{VarBinArray, VarBinVTable};
14use crate::arrow::compute::to_arrow::null_buffer::to_null_buffer;
15use crate::arrow::compute::{ToArrowKernel, ToArrowKernelAdapter};
16use crate::compute::cast;
17use crate::{Array, ToCanonical, register_kernel};
18
19impl ToArrowKernel for VarBinVTable {
20    fn to_arrow(
21        &self,
22        array: &VarBinArray,
23        arrow_type: Option<&DataType>,
24    ) -> VortexResult<Option<ArrowArrayRef>> {
25        let offsets_ptype = PType::try_from(array.offsets().dtype())?;
26
27        match arrow_type {
28            // Emit out preferred Arrow VarBin array.
29            None => match array.dtype() {
30                DType::Binary(_) => match offsets_ptype {
31                    PType::I64 | PType::U64 => to_arrow::<i64>(array),
32                    PType::U8 | PType::U16 | PType::U32 | PType::I8 | PType::I16 | PType::I32 => {
33                        to_arrow::<i32>(array)
34                    }
35                    PType::F16 | PType::F32 | PType::F64 => {
36                        vortex_panic!("offsets array were somehow floating point")
37                    }
38                },
39                DType::Utf8(_) => match offsets_ptype {
40                    PType::I64 | PType::U64 => to_arrow::<i64>(array),
41                    PType::U8 | PType::U16 | PType::U32 | PType::I8 | PType::I16 | PType::I32 => {
42                        to_arrow::<i32>(array)
43                    }
44                    PType::F16 | PType::F32 | PType::F64 => {
45                        vortex_panic!("offsets array were somehow floating point")
46                    }
47                },
48                dtype => unreachable!("Unsupported DType {dtype}"),
49            },
50            // Emit the requested Arrow array.
51            Some(DataType::Binary) if array.dtype().is_binary() => to_arrow::<i32>(array),
52            Some(DataType::LargeBinary) if array.dtype().is_binary() => to_arrow::<i64>(array),
53            Some(DataType::Utf8) if array.dtype().is_utf8() => to_arrow::<i32>(array),
54            Some(DataType::LargeUtf8) if array.dtype().is_utf8() => to_arrow::<i64>(array),
55            // Allow fallback to canonicalize to a VarBinView and try again.
56            Some(DataType::BinaryView) | Some(DataType::Utf8View) => {
57                return Ok(None);
58            }
59            // Any other type is not supported.
60            Some(_) => {
61                vortex_bail!("Cannot convert VarBin to Arrow type {arrow_type:?}");
62            }
63        }
64        .map(Some)
65    }
66}
67
// Hands the adapter-wrapped `VarBinVTable` kernel to `register_kernel!`.
// NOTE(review): presumably this is what makes the `ToArrowKernel` impl above discoverable
// by the to-Arrow compute dispatch — confirm against the macro definition.
register_kernel!(ToArrowKernelAdapter(VarBinVTable).lift());
69
70fn to_arrow<O: IntegerPType + OffsetSizeTrait>(array: &VarBinArray) -> VortexResult<ArrowArrayRef> {
71    let offsets = cast(
72        array.offsets(),
73        &DType::Primitive(O::PTYPE, Nullability::NonNullable),
74    )?
75    .to_primitive();
76
77    let nulls = to_null_buffer(array.validity_mask());
78    let data = array.bytes().clone();
79
80    // Match on the `DType`.
81    Ok(match array.dtype() {
82        DType::Binary(_) => Arc::new(unsafe {
83            GenericBinaryArray::new_unchecked(
84                offsets.buffer::<O>().into_arrow_offset_buffer(),
85                data.into_arrow_buffer(),
86                nulls,
87            )
88        }),
89        DType::Utf8(_) => Arc::new(unsafe {
90            GenericStringArray::new_unchecked(
91                offsets.buffer::<O>().into_arrow_offset_buffer(),
92                data.into_arrow_buffer(),
93                nulls,
94            )
95        }),
96        dtype => {
97            unreachable!("expected utf8 or binary instead of {dtype}")
98        }
99    })
100}