vortex_array/arrays/varbin/compute/
to_arrow.rs

1use std::sync::Arc;
2
3use arrow_array::{ArrayRef, GenericBinaryArray, GenericStringArray, OffsetSizeTrait};
4use arrow_schema::DataType;
5use vortex_dtype::{DType, NativePType, Nullability, PType};
6use vortex_error::{VortexResult, vortex_bail};
7
8use crate::arrays::{VarBinArray, VarBinEncoding};
9use crate::compute::{ToArrowFn, cast};
10use crate::{Array, ToCanonical};
11
12impl ToArrowFn<&VarBinArray> for VarBinEncoding {
13    fn preferred_arrow_data_type(&self, array: &VarBinArray) -> VortexResult<Option<DataType>> {
14        let offsets_ptype = PType::try_from(array.offsets().dtype())?;
15        Ok(Some(match array.dtype() {
16            DType::Utf8(_) => match offsets_ptype {
17                PType::I64 | PType::U64 => DataType::LargeUtf8,
18                _ => DataType::Utf8,
19            },
20            DType::Binary(_) => match offsets_ptype {
21                PType::I64 | PType::U64 => DataType::LargeBinary,
22                _ => DataType::Binary,
23            },
24            _ => vortex_bail!("Unsupported DType"),
25        }))
26    }
27
28    fn to_arrow(
29        &self,
30        array: &VarBinArray,
31        data_type: &DataType,
32    ) -> VortexResult<Option<ArrayRef>> {
33        let array_ref = match data_type {
34            DataType::FixedSizeBinary(_) => {
35                // TODO(ngates): we should support converting VarBin into these Arrow arrays.
36                return Ok(None);
37            }
38            DataType::BinaryView | DataType::Utf8View => Ok(arrow_cast::cast(
39                &*varbin_to_arrow::<i32>(array)?,
40                data_type,
41            )?),
42            DataType::Binary | DataType::Utf8 => {
43                // These are both supported with a zero-copy cast, see below
44                varbin_to_arrow::<i32>(array)
45            }
46            DataType::LargeBinary | DataType::LargeUtf8 => {
47                // These are both supported with a zero-copy cast, see below
48                varbin_to_arrow::<i64>(array)
49            }
50            _ => {
51                // Everything else is unsupported
52                vortex_bail!("Unsupported data type: {data_type}")
53            }
54        }?;
55
56        Ok(Some(if array_ref.data_type() != data_type {
57            arrow_cast::cast(array_ref.as_ref(), data_type)?
58        } else {
59            array_ref
60        }))
61    }
62}
63
64/// Convert the array to Arrow variable length binary array type.
65pub(crate) fn varbin_to_arrow<O: NativePType + OffsetSizeTrait>(
66    varbin_array: &VarBinArray,
67) -> VortexResult<ArrayRef> {
68    let offsets = cast(
69        varbin_array.offsets(),
70        &DType::Primitive(O::PTYPE, Nullability::NonNullable),
71    )?
72    .to_primitive()
73    .map_err(|err| err.with_context("Failed to canonicalize offsets"))?;
74
75    let nulls = varbin_array.validity_mask()?.to_null_buffer();
76    let data = varbin_array.bytes().clone();
77
78    // Switch on DType.
79    Ok(match varbin_array.dtype() {
80        DType::Binary(_) => Arc::new(unsafe {
81            GenericBinaryArray::new_unchecked(
82                offsets.buffer::<O>().into_arrow_offset_buffer(),
83                data.into_arrow_buffer(),
84                nulls,
85            )
86        }),
87        DType::Utf8(_) => Arc::new(unsafe {
88            GenericStringArray::new_unchecked(
89                offsets.buffer::<O>().into_arrow_offset_buffer(),
90                data.into_arrow_buffer(),
91                nulls,
92            )
93        }),
94        _ => vortex_bail!(
95            "expected utf8 or binary instead of {}",
96            varbin_array.dtype()
97        ),
98    })
99}