use std::sync::Arc;
use arrow_array::ArrayRef as ArrowArrayRef;
use arrow_array::GenericByteArray;
use arrow_array::types::BinaryViewType;
use arrow_array::types::ByteArrayType;
use arrow_array::types::StringViewType;
use vortex_error::VortexError;
use vortex_error::VortexResult;
use crate::ArrayRef;
use crate::Canonical;
use crate::ExecutionCtx;
use crate::array::ArrayView;
use crate::arrays::VarBin;
use crate::arrays::VarBinViewArray;
use crate::arrays::varbin::VarBinArrayExt;
use crate::arrow::byte_view::execute_varbinview_to_arrow;
use crate::arrow::executor::validity::to_arrow_null_buffer;
use crate::builtins::ArrayBuiltins;
use crate::dtype::DType;
use crate::dtype::NativePType;
use crate::dtype::Nullability;
/// Executes a Vortex array into an Arrow byte array of type `T`
/// (the offset-based binary/string layouts selected by `T::DATA_TYPE`).
///
/// Two strategies are used:
/// * if `array` is already `VarBin`-encoded, it is handed to
///   [`varbin_to_byte_array`], which builds the Arrow array from the existing
///   offsets/bytes buffers;
/// * otherwise the array is executed into a [`VarBinViewArray`], exported as
///   an Arrow view array, and then cast to `T::DATA_TYPE` with `arrow_cast`.
///
/// # Errors
///
/// Propagates any error from executing the array, exporting the view array,
/// or from the final Arrow cast (converted into a [`VortexError`]).
///
/// # Panics
///
/// Panics if the executed `VarBinViewArray` reports a dtype other than
/// `Utf8` or `Binary`.
pub(super) fn to_arrow_byte_array<T: ByteArrayType>(
    array: ArrayRef,
    ctx: &mut ExecutionCtx,
) -> VortexResult<ArrowArrayRef>
where
    T::Offset: NativePType,
{
    // Fast path: the encoding is already offsets + bytes.
    if let Some(varbin) = array.as_opt::<VarBin>() {
        return varbin_to_byte_array::<T>(varbin, ctx);
    }

    // General path: go through the view representation first.
    let view_array = array.execute::<VarBinViewArray>(ctx)?;
    let arrow_view = match view_array.dtype() {
        DType::Binary(_) => execute_varbinview_to_arrow::<BinaryViewType>(&view_array, ctx)?,
        DType::Utf8(_) => execute_varbinview_to_arrow::<StringViewType>(&view_array, ctx)?,
        _ => unreachable!("VarBinViewArray must have Utf8 or Binary dtype"),
    };

    // Let Arrow convert the view array into the requested offset-based type.
    let converted = arrow_cast::cast(&arrow_view, &T::DATA_TYPE).map_err(VortexError::from)?;
    Ok(converted)
}
fn varbin_to_byte_array<T: ByteArrayType>(
array: ArrayView<'_, VarBin>,
ctx: &mut ExecutionCtx,
) -> VortexResult<ArrowArrayRef>
where
T::Offset: NativePType,
{
let offsets = array
.offsets()
.cast(DType::Primitive(T::Offset::PTYPE, Nullability::NonNullable))?
.execute::<Canonical>(ctx)?
.into_primitive()
.to_buffer::<T::Offset>()
.into_arrow_offset_buffer();
let data = array.bytes().clone().into_arrow_buffer();
let null_buffer = to_arrow_null_buffer(array.validity()?, array.len(), ctx)?;
Ok(Arc::new(unsafe {
GenericByteArray::<T>::new_unchecked(offsets, data, null_buffer)
}))
}
#[cfg(test)]
mod tests {
    use arrow_array::Array;
    use arrow_array::cast::AsArray;
    use arrow_schema::DataType;
    use rstest::rstest;

    use crate::IntoArray;
    use crate::LEGACY_SESSION;
    use crate::VortexSessionExecute;
    use crate::arrow::ArrowArrayExecutor;
    use crate::arrow::executor::byte::VarBinViewArray;
    use crate::dtype::DType;
    use crate::dtype::Nullability;

    /// Values used by the non-nullable fixtures: two short entries and one
    /// longer one.
    const SAMPLE_VALUES: [&str; 3] = ["hello", "world", "this is a longer string for testing"];

    /// Fixture: the sample values as a string view array.
    fn make_utf8_array() -> VarBinViewArray {
        VarBinViewArray::from_iter_str(SAMPLE_VALUES)
    }

    /// Fixture: the same sample values as a binary view array.
    fn make_binary_array() -> VarBinViewArray {
        VarBinViewArray::from_iter_bin(SAMPLE_VALUES.map(str::as_bytes))
    }

    /// Every combination of string/binary source and Arrow byte-like target
    /// must produce the requested Arrow type with the values intact.
    #[rstest]
    #[case::utf8_to_binary(make_utf8_array(), DataType::Binary)]
    #[case::utf8_to_large_binary(make_utf8_array(), DataType::LargeBinary)]
    #[case::utf8_to_utf8(make_utf8_array(), DataType::Utf8)]
    #[case::utf8_to_large_utf8(make_utf8_array(), DataType::LargeUtf8)]
    #[case::utf8_to_utf8_view(make_utf8_array(), DataType::Utf8View)]
    #[case::binary_to_binary(make_binary_array(), DataType::Binary)]
    #[case::binary_to_large_binary(make_binary_array(), DataType::LargeBinary)]
    #[case::binary_to_utf8(make_binary_array(), DataType::Utf8)]
    #[case::binary_to_large_utf8(make_binary_array(), DataType::LargeUtf8)]
    #[case::binary_to_binary_view(make_binary_array(), DataType::BinaryView)]
    fn test_vortex_string_binary_to_arrow(
        #[case] vortex_array: VarBinViewArray,
        #[case] target_dtype: DataType,
    ) {
        let mut ctx = LEGACY_SESSION.create_execution_ctx();
        let converted = vortex_array
            .into_array()
            .execute_arrow(Some(&target_dtype), &mut ctx)
            .unwrap();

        assert_eq!(converted.data_type(), &target_dtype);
        assert_eq!(converted.len(), 3);
        assert_eq!(converted.null_count(), 0);

        // Read each element back through the accessor matching the target type.
        for (i, want) in SAMPLE_VALUES.iter().map(|s| s.as_bytes()).enumerate() {
            let got: &[u8] = match &target_dtype {
                DataType::Binary => converted.as_binary::<i32>().value(i),
                DataType::LargeBinary => converted.as_binary::<i64>().value(i),
                DataType::Utf8 => converted.as_string::<i32>().value(i).as_bytes(),
                DataType::LargeUtf8 => converted.as_string::<i64>().value(i).as_bytes(),
                DataType::BinaryView => converted.as_binary_view().value(i),
                DataType::Utf8View => converted.as_string_view().value(i).as_bytes(),
                _ => unreachable!(),
            };
            assert_eq!(got, want, "Mismatch at index {i}");
        }
    }

    /// Null entries must survive the conversion for every source/target pair.
    #[rstest]
    #[case::utf8_to_binary(DType::Utf8(Nullability::Nullable), DataType::Binary)]
    #[case::utf8_to_large_binary(DType::Utf8(Nullability::Nullable), DataType::LargeBinary)]
    #[case::utf8_to_utf8(DType::Utf8(Nullability::Nullable), DataType::Utf8)]
    #[case::utf8_to_large_utf8(DType::Utf8(Nullability::Nullable), DataType::LargeUtf8)]
    #[case::utf8_to_utf8_view(DType::Utf8(Nullability::Nullable), DataType::Utf8View)]
    #[case::binary_to_binary(DType::Binary(Nullability::Nullable), DataType::Binary)]
    #[case::binary_to_large_binary(DType::Binary(Nullability::Nullable), DataType::LargeBinary)]
    #[case::binary_to_utf8(DType::Binary(Nullability::Nullable), DataType::Utf8)]
    #[case::binary_to_large_utf8(DType::Binary(Nullability::Nullable), DataType::LargeUtf8)]
    #[case::binary_to_binary_view(DType::Binary(Nullability::Nullable), DataType::BinaryView)]
    fn test_nullable_vortex_string_binary_to_arrow(
        #[case] vortex_dtype: DType,
        #[case] target_dtype: DataType,
    ) {
        // Middle element is null; the outer two are present.
        let entries = [Some(b"hello".as_slice()), None, Some(b"world".as_slice())];
        let source = VarBinViewArray::from_iter(entries, vortex_dtype);

        let mut ctx = LEGACY_SESSION.create_execution_ctx();
        let converted = source
            .into_array()
            .execute_arrow(Some(&target_dtype), &mut ctx)
            .unwrap();

        assert_eq!(converted.data_type(), &target_dtype);
        assert_eq!(converted.len(), 3);
        assert_eq!(converted.null_count(), 1);
        assert!(!converted.is_null(0));
        assert!(converted.is_null(1));
        assert!(!converted.is_null(2));
    }
}