vortex_array/arrays/varbin/compute/
to_arrow.rs

1use std::sync::Arc;
2
3use arrow_array::{ArrayRef, GenericBinaryArray, GenericStringArray, OffsetSizeTrait};
4use arrow_schema::DataType;
5use vortex_dtype::{DType, NativePType, Nullability, PType};
6use vortex_error::{VortexResult, vortex_bail};
7
8use crate::arrays::{VarBinArray, VarBinEncoding};
9use crate::compute::{ToArrowFn, try_cast};
10use crate::{Array, ToCanonical};
11
12impl ToArrowFn<&VarBinArray> for VarBinEncoding {
13    fn preferred_arrow_data_type(&self, array: &VarBinArray) -> VortexResult<Option<DataType>> {
14        let offsets_ptype = PType::try_from(array.offsets().dtype())?;
15        Ok(Some(match array.dtype() {
16            DType::Utf8(_) => match offsets_ptype {
17                PType::I64 | PType::U64 => DataType::LargeUtf8,
18                _ => DataType::Utf8,
19            },
20            DType::Binary(_) => match offsets_ptype {
21                PType::I64 | PType::U64 => DataType::LargeBinary,
22                _ => DataType::Binary,
23            },
24            _ => vortex_bail!("Unsupported DType"),
25        }))
26    }
27
28    fn to_arrow(
29        &self,
30        array: &VarBinArray,
31        data_type: &DataType,
32    ) -> VortexResult<Option<ArrayRef>> {
33        let array_ref = match data_type {
34            DataType::BinaryView | DataType::FixedSizeBinary(_) | DataType::Utf8View => {
35                // TODO(ngates): we should support converting VarBin into these Arrow arrays.
36                return Ok(None);
37            }
38            DataType::Binary | DataType::Utf8 => {
39                // These are both supported with a zero-copy cast, see below
40                varbin_to_arrow::<i32>(array)
41            }
42            DataType::LargeBinary | DataType::LargeUtf8 => {
43                // These are both supported with a zero-copy cast, see below
44                varbin_to_arrow::<i64>(array)
45            }
46            _ => {
47                // Everything else is unsupported
48                vortex_bail!("Unsupported data type: {data_type}")
49            }
50        }?;
51
52        Ok(Some(if array_ref.data_type() != data_type {
53            arrow_cast::cast(array_ref.as_ref(), data_type)?
54        } else {
55            array_ref
56        }))
57    }
58}
59
60/// Convert the array to Arrow variable length binary array type.
61pub(crate) fn varbin_to_arrow<O: NativePType + OffsetSizeTrait>(
62    varbin_array: &VarBinArray,
63) -> VortexResult<ArrayRef> {
64    let offsets = try_cast(
65        varbin_array.offsets(),
66        &DType::Primitive(O::PTYPE, Nullability::NonNullable),
67    )?
68    .to_primitive()
69    .map_err(|err| err.with_context("Failed to canonicalize offsets"))?;
70
71    let nulls = varbin_array.validity_mask()?.to_null_buffer();
72    let data = varbin_array.bytes().clone();
73
74    // Switch on DType.
75    Ok(match varbin_array.dtype() {
76        DType::Binary(_) => Arc::new(unsafe {
77            GenericBinaryArray::new_unchecked(
78                offsets.buffer::<O>().into_arrow_offset_buffer(),
79                data.into_arrow_buffer(),
80                nulls,
81            )
82        }),
83        DType::Utf8(_) => Arc::new(unsafe {
84            GenericStringArray::new_unchecked(
85                offsets.buffer::<O>().into_arrow_offset_buffer(),
86                data.into_arrow_buffer(),
87                nulls,
88            )
89        }),
90        _ => vortex_bail!(
91            "expected utf8 or binary instead of {}",
92            varbin_array.dtype()
93        ),
94    })
95}