vortex_array/array/varbin/compute/
to_arrow.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
use std::sync::Arc;

use arrow_array::{ArrayRef, GenericBinaryArray, GenericStringArray, OffsetSizeTrait};
use arrow_schema::DataType;
use vortex_dtype::{DType, NativePType, Nullability, PType};
use vortex_error::{vortex_bail, VortexResult};

use crate::array::{VarBinArray, VarBinEncoding};
use crate::compute::{try_cast, ToArrowFn};
use crate::IntoArrayVariant;

impl ToArrowFn<VarBinArray> for VarBinEncoding {
    /// Picks the Arrow data type this array converts to most naturally.
    ///
    /// Offsets stored as `I64` or `U64` select the "large" Arrow variants
    /// (`LargeUtf8` / `LargeBinary`); every other offset type selects the
    /// regular `Utf8` / `Binary` variants.
    ///
    /// # Errors
    ///
    /// Fails if the offsets dtype cannot be turned into a [`PType`], or if the
    /// array's dtype is neither `Utf8` nor `Binary`.
    fn preferred_arrow_data_type(&self, array: &VarBinArray) -> VortexResult<Option<DataType>> {
        let wide_offsets = matches!(
            PType::try_from(array.offsets().dtype())?,
            PType::I64 | PType::U64
        );

        let preferred = match (array.dtype(), wide_offsets) {
            (DType::Utf8(_), true) => DataType::LargeUtf8,
            (DType::Utf8(_), false) => DataType::Utf8,
            (DType::Binary(_), true) => DataType::LargeBinary,
            (DType::Binary(_), false) => DataType::Binary,
            _ => vortex_bail!("Unsupported DType"),
        };

        Ok(Some(preferred))
    }

    /// Converts `array` into an Arrow array of the requested `data_type`.
    ///
    /// Returns `Ok(None)` for the view and fixed-size binary targets, which
    /// this encoding does not convert itself.
    ///
    /// # Errors
    ///
    /// Fails for any target type outside the binary/string family, and
    /// propagates errors from [`varbin_to_arrow`] and from the final Arrow cast.
    fn to_arrow(
        &self,
        array: &VarBinArray,
        data_type: &DataType,
    ) -> VortexResult<Option<ArrayRef>> {
        let converted = match data_type {
            // 32-bit offset targets: built by `varbin_to_arrow` with `i32` offsets.
            DataType::Binary | DataType::Utf8 => varbin_to_arrow::<i32>(array)?,
            // 64-bit offset targets: built by `varbin_to_arrow` with `i64` offsets.
            DataType::LargeBinary | DataType::LargeUtf8 => varbin_to_arrow::<i64>(array)?,
            // TODO(ngates): we should support converting VarBin into these Arrow arrays.
            DataType::BinaryView | DataType::FixedSizeBinary(_) | DataType::Utf8View => {
                return Ok(None);
            }
            // Everything else is unsupported.
            _ => vortex_bail!("Unsupported data type: {data_type}"),
        };

        // `varbin_to_arrow` picks binary vs. string from the Vortex dtype, so the
        // result may not be the exact type requested (e.g. a string array when
        // `Binary` was asked for). Only then is an Arrow cast needed.
        if converted.data_type() == data_type {
            return Ok(Some(converted));
        }
        Ok(Some(arrow_cast::cast(converted.as_ref(), data_type)?))
    }
}

/// Builds an Arrow variable-length binary or string array from a [`VarBinArray`].
///
/// The offsets are cast to the non-nullable primitive type `O` and canonicalized
/// into a primitive array; the null buffer comes from the array's logical
/// validity. A `Binary` dtype yields a [`GenericBinaryArray`] and a `Utf8` dtype
/// yields a [`GenericStringArray`], both with offset type `O`.
///
/// # Errors
///
/// Fails if the offsets cannot be cast or canonicalized, if the logical validity
/// cannot be computed, or if the array's dtype is neither `Utf8` nor `Binary`.
pub(crate) fn varbin_to_arrow<O>(varbin_array: &VarBinArray) -> VortexResult<ArrayRef>
where
    O: NativePType + OffsetSizeTrait,
{
    let offset_dtype = DType::Primitive(O::PTYPE, Nullability::NonNullable);
    let offsets = try_cast(varbin_array.offsets(), &offset_dtype)?
        .into_primitive()
        .map_err(|err| err.with_context("Failed to canonicalize offsets"))?;

    let nulls = varbin_array.logical_validity()?.to_null_buffer();
    let data = varbin_array.bytes();

    // The Vortex dtype decides which Arrow array flavour gets built.
    let arrow_array: ArrayRef = match varbin_array.dtype() {
        DType::Binary(_) => {
            // SAFETY: `new_unchecked` skips Arrow's validation of offsets against
            // the value buffer. NOTE(review): presumably `VarBinArray` already
            // guarantees well-formed offsets — confirm.
            let binary = unsafe {
                GenericBinaryArray::new_unchecked(
                    offsets.buffer::<O>().into_arrow_offset_buffer(),
                    data.into_arrow_buffer(),
                    nulls,
                )
            };
            Arc::new(binary)
        }
        DType::Utf8(_) => {
            // SAFETY: `new_unchecked` skips Arrow's offset and UTF-8 validation.
            // NOTE(review): presumably a `Utf8`-typed `VarBinArray` only holds
            // valid UTF-8 with well-formed offsets — confirm.
            let string = unsafe {
                GenericStringArray::new_unchecked(
                    offsets.buffer::<O>().into_arrow_offset_buffer(),
                    data.into_arrow_buffer(),
                    nulls,
                )
            };
            Arc::new(string)
        }
        _ => vortex_bail!(
            "expected utf8 or binary instead of {}",
            varbin_array.dtype()
        ),
    };

    Ok(arrow_array)
}