use std::fmt::Display;
use std::fmt::Formatter;
use std::mem::size_of;
use std::sync::Arc;
use vortex_buffer::Alignment;
use vortex_buffer::Buffer;
use vortex_buffer::ByteBuffer;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use vortex_error::vortex_bail;
use vortex_error::vortex_ensure;
use vortex_error::vortex_err;
use vortex_error::vortex_panic;
use vortex_mask::Mask;
use crate::ArrayRef;
use crate::array::Array;
use crate::array::ArrayParts;
use crate::array::TypedArrayRef;
use crate::array::child_to_validity;
use crate::array::validity_to_child;
use crate::arrays::VarBinView;
use crate::arrays::varbinview::BinaryView;
use crate::buffer::BufferHandle;
use crate::builders::ArrayBuilder;
use crate::builders::VarBinViewBuilder;
use crate::dtype::DType;
use crate::dtype::Nullability;
use crate::validity::Validity;
/// Index of the validity child within the array's slots.
pub(super) const VALIDITY_SLOT: usize = 0;
/// Total number of slots carried by the array.
pub(super) const NUM_SLOTS: usize = 1;
/// Names of the slots, listed in slot-index order.
pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["validity"];
/// Buffer-level payload of a `VarBinView` array.
///
/// Validity is not stored here; it is kept as a child in the array's slots
/// (see [`VarBinViewData::make_slots`]).
#[derive(Clone, Debug)]
pub struct VarBinViewData {
// Data buffers that non-inlined views point into via their `buffer_index`.
pub(super) buffers: Arc<[BufferHandle]>,
// Raw bytes of the `BinaryView` entries; the element count is this buffer's
// byte length divided by `size_of::<BinaryView>()`.
pub(super) views: BufferHandle,
}
/// `Display` for [`VarBinViewData`] intentionally emits no text.
impl Display for VarBinViewData {
    /// Writes nothing to the formatter and always succeeds.
    fn fmt(&self, _formatter: &mut Formatter<'_>) -> std::fmt::Result {
        Ok(())
    }
}
/// Owned components of a `VarBinView` array, as produced by
/// `Array::<VarBinView>::into_data_parts`.
pub struct VarBinViewDataParts {
/// Logical type of the array.
pub dtype: DType,
/// Data buffers referenced by non-inlined views.
pub buffers: Arc<[BufferHandle]>,
/// Buffer holding the raw `BinaryView` entries.
pub views: BufferHandle,
/// Validity of the array, reconstructed from its validity slot.
pub validity: Validity,
}
impl VarBinViewData {
/// Splits a dtype into `(is_utf8, nullability)`.
///
/// # Errors
///
/// Returns an `InvalidArgument` error when `dtype` is neither `Utf8` nor `Binary`.
fn dtype_parts(dtype: &DType) -> VortexResult<(bool, Nullability)> {
    let nullability = match dtype {
        DType::Utf8(n) | DType::Binary(n) => *n,
        _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
    };
    Ok((matches!(dtype, DType::Utf8(_)), nullability))
}
pub(super) fn make_slots(validity: &Validity, len: usize) -> Vec<Option<ArrayRef>> {
vec![validity_to_child(validity, len)]
}
/// Builds the data from host buffers, validating all components.
///
/// # Panics
///
/// Panics if [`Self::try_new`] rejects the inputs.
pub fn new(
    views: Buffer<BinaryView>,
    buffers: Arc<[ByteBuffer]>,
    dtype: DType,
    validity: Validity,
) -> Self {
    let built = Self::try_new(views, buffers, dtype, validity);
    built.vortex_expect("VarBinViewArray construction failed")
}
/// Builds the data from buffer handles.
///
/// # Panics
///
/// Panics if [`Self::try_new_handle`] rejects the inputs.
pub fn new_handle(
    views: BufferHandle,
    buffers: Arc<[BufferHandle]>,
    dtype: DType,
    validity: Validity,
) -> Self {
    // The message previously misspelled the type as `VarbinViewArray`; it now
    // matches the spelling used by `new` and the rest of this file.
    Self::try_new_handle(views, buffers, dtype, validity)
        .vortex_expect("VarBinViewArray construction failed")
}
/// Builds the data from host buffers after running [`Self::validate`].
///
/// # Errors
///
/// Returns whatever error [`Self::validate`] reports for the given parts.
pub fn try_new(
    views: Buffer<BinaryView>,
    buffers: Arc<[ByteBuffer]>,
    dtype: DType,
    validity: Validity,
) -> VortexResult<Self> {
    Self::validate(&views, &buffers, &dtype, &validity)?;
    // SAFETY: the parts were validated on the line above.
    let data = unsafe { Self::new_unchecked(views, buffers, dtype, validity) };
    Ok(data)
}
/// Builds the data from buffer handles, checking what can be checked
/// without reading the view contents.
///
/// The checks are: the byte length of `views` is a multiple of
/// `size_of::<BinaryView>()`; when `views.as_host_opt()` yields a host
/// buffer, that buffer is aligned for `BinaryView`; and `dtype` is `Utf8`
/// or `Binary`. The contents of the views and data buffers are not inspected.
///
/// # Errors
///
/// Returns an error if any of the checks above fails.
pub fn try_new_handle(
    views: BufferHandle,
    buffers: Arc<[BufferHandle]>,
    dtype: DType,
    validity: Validity,
) -> VortexResult<Self> {
    let views_nbytes = views.len();
    vortex_ensure!(
        views_nbytes.is_multiple_of(size_of::<BinaryView>()),
        "Expected views buffer length ({views_nbytes}) to be a multiple of {}",
        size_of::<BinaryView>()
    );
    if let Some(host) = views.as_host_opt() {
        vortex_ensure!(
            host.is_aligned(Alignment::of::<BinaryView>()),
            "Views on host must be 16 byte aligned"
        );
    }
    // `new_handle_unchecked` panics on a dtype that is neither utf8 nor binary.
    // A fallible constructor must report that as an error instead, so the dtype
    // is checked here. It runs after the checks above so that inputs which
    // already produced an error keep producing the same one.
    Self::dtype_parts(&dtype)?;
    // SAFETY: length, host alignment and dtype were checked above.
    Ok(unsafe { Self::new_handle_unchecked(views, buffers, dtype, validity) })
}
/// Builds the data from host buffers without validating them in release builds.
///
/// With `debug_assertions` enabled the parts are still passed through
/// [`Self::validate`], and a failure panics.
///
/// # Safety
///
/// The parts are expected to satisfy everything [`Self::validate`] checks.
pub unsafe fn new_unchecked(
    views: Buffer<BinaryView>,
    buffers: Arc<[ByteBuffer]>,
    dtype: DType,
    validity: Validity,
) -> Self {
    #[cfg(debug_assertions)]
    Self::validate(&views, &buffers, &dtype, &validity)
        .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters");
    // Wrap every host data buffer in a handle, collecting straight into the shared slice.
    let data_handles: Arc<[BufferHandle]> = buffers
        .iter()
        .map(|buffer| BufferHandle::new_host(buffer.clone()))
        .collect();
    let views_handle = BufferHandle::new_host(views.into_byte_buffer());
    // SAFETY: forwarded from this function's own contract.
    unsafe { Self::new_handle_unchecked(views_handle, data_handles, dtype, validity) }
}
/// Builds the data from buffer handles without inspecting the buffers.
///
/// `_validity` is accepted for signature parity with the other constructors
/// but is not stored in the returned value.
///
/// # Safety
///
/// [`Self::views`] reinterprets the host view bytes as `BinaryView`s, so the
/// views buffer must be suitably aligned and sized for that. Presumably the
/// views must also reference `buffers` correctly — confirm against callers.
///
/// # Panics
///
/// Panics if `dtype` is neither utf8 nor binary.
pub unsafe fn new_handle_unchecked(
    views: BufferHandle,
    buffers: Arc<[BufferHandle]>,
    dtype: DType,
    _validity: Validity,
) -> Self {
    // Only the dtype is checked; the parts themselves are discarded.
    let (_is_utf8, _nullability) =
        Self::dtype_parts(&dtype).vortex_expect("VarBinViewArray dtype must be utf8 or binary");
    Self { views, buffers }
}
/// Validates the host components of a `VarBinView` array.
///
/// Checks that the nullability of `validity` matches that of `dtype`, then
/// checks every non-null view via `validate_views`; for `Utf8` the
/// referenced bytes must additionally be valid UTF-8.
///
/// # Errors
///
/// Returns an `InvalidArgument` error on a nullability mismatch, on a dtype
/// other than `Utf8`/`Binary`, or when a view fails validation.
pub fn validate(
    views: &Buffer<BinaryView>,
    buffers: &Arc<[ByteBuffer]>,
    dtype: &DType,
    validity: &Validity,
) -> VortexResult<()> {
    let expected = dtype.nullability();
    vortex_ensure!(
        validity.nullability() == expected,
        InvalidArgument: "validity {:?} incompatible with nullability {:?}",
        validity,
        expected
    );
    match dtype {
        DType::Utf8(_) => Self::validate_views(views, buffers, validity, |bytes| {
            simdutf8::basic::from_utf8(bytes).is_ok()
        }),
        // Binary data has no content constraint.
        DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true),
        _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
    }
}
fn validate_views<F>(
views: &Buffer<BinaryView>,
buffers: &Arc<[ByteBuffer]>,
validity: &Validity,
validator: F,
) -> VortexResult<()>
where
F: Fn(&[u8]) -> bool,
{
for (idx, &view) in views.iter().enumerate() {
if validity.is_null(idx)? {
continue;
}
if view.is_inlined() {
let bytes = &view.as_inlined().data[..view.len() as usize];
vortex_ensure!(
validator(bytes),
InvalidArgument: "view at index {idx}: inlined bytes failed utf-8 validation"
);
} else {
let view = view.as_view();
let buf_index = view.buffer_index as usize;
let start_offset = view.offset as usize;
let end_offset = start_offset.saturating_add(view.size as usize);
let buf = buffers.get(buf_index).ok_or_else(||
vortex_err!(InvalidArgument: "view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewData with {} buffers",
buffers.len()))?;
vortex_ensure!(
start_offset < buf.len(),
InvalidArgument: "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
buf.len(),
);
vortex_ensure!(
end_offset <= buf.len(),
InvalidArgument: "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
buf.len(),
);
let bytes = &buf[start_offset..end_offset];
vortex_ensure!(
view.prefix == bytes[..4],
InvalidArgument: "VarBinView prefix does not match full string"
);
vortex_ensure!(
validator(bytes),
InvalidArgument: "view at index {idx}: outlined bytes fails utf-8 validation"
);
}
}
Ok(())
}
/// Number of views, i.e. the views buffer's byte length divided by the size
/// of one `BinaryView`.
pub fn len(&self) -> usize {
    let views_nbytes = self.views.len();
    views_nbytes / size_of::<BinaryView>()
}
/// Returns `true` when there are no views.
pub fn is_empty(&self) -> bool {
    let view_count = self.len();
    view_count == 0
}
/// Returns the views as a typed slice over the host views buffer.
///
/// Uses `as_host()` on the views handle, so the views are expected to be
/// host-resident.
#[inline]
pub fn views(&self) -> &[BinaryView] {
    let host_bytes = self.views.as_host();
    let first_view = host_bytes.as_ptr().cast::<BinaryView>();
    let view_count = host_bytes.len() / size_of::<BinaryView>();
    // SAFETY: relies on the construction-time invariant that the views buffer is
    // aligned for `BinaryView`; `view_count` only covers whole views in the buffer.
    unsafe { std::slice::from_raw_parts(first_view, view_count) }
}
pub fn views_handle(&self) -> &BufferHandle {
&self.views
}
/// Returns the bytes of the value at `index` as a buffer slice.
///
/// Inlined values are sliced out of the views buffer itself; other values are
/// sliced out of the data buffer the view references.
///
/// # Panics
///
/// Panics if `index` is out of bounds, or if the view references a data
/// buffer index that does not exist.
#[inline]
pub fn bytes_at(&self, index: usize) -> ByteBuffer {
    // Keep a reference into the views buffer: `slice_ref` below needs the
    // inlined bytes to point inside that same allocation.
    let view = &self.views()[index];
    if view.is_inlined() {
        let views_bytes = self.views_handle().as_host().clone().into_byte_buffer();
        views_bytes.slice_ref(view.as_inlined().value())
    } else {
        let outlined = view.as_view();
        self.buffer(outlined.buffer_index as usize)
            .slice(outlined.as_range())
    }
}
#[inline]
pub fn buffer(&self, idx: usize) -> &ByteBuffer {
if idx >= self.data_buffers().len() {
vortex_panic!(
"{idx} buffer index out of bounds, there are {} buffers",
self.data_buffers().len()
);
}
self.buffers[idx].as_host()
}
#[inline]
pub fn data_buffers(&self) -> &Arc<[BufferHandle]> {
&self.buffers
}
/// Builds the data of the given `dtype` from optional byte values; `None`
/// items become nulls.
#[expect(
    clippy::same_name_method,
    reason = "intentionally named from_iter like Iterator::from_iter"
)]
pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
    iter: I,
    dtype: DType,
) -> Self {
    let iter = iter.into_iter();
    // Pre-size the builder with the iterator's lower size bound.
    let (capacity, _) = iter.size_hint();
    let mut builder = VarBinViewBuilder::with_capacity(dtype, capacity);
    iter.for_each(|item| match item {
        Some(value) => builder.append_value(value),
        None => builder.append_null(),
    });
    builder.finish_into_varbinview().into_data()
}
/// Builds non-nullable UTF-8 data from string values.
pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
    let iter = iter.into_iter();
    let (capacity, _) = iter.size_hint();
    let dtype = DType::Utf8(Nullability::NonNullable);
    let mut builder = VarBinViewBuilder::with_capacity(dtype, capacity);
    iter.for_each(|item| builder.append_value(item.as_ref()));
    builder.finish_into_varbinview().into_data()
}
/// Builds nullable UTF-8 data from optional string values; `None` items
/// become nulls.
pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
    iter: I,
) -> Self {
    let iter = iter.into_iter();
    let (capacity, _) = iter.size_hint();
    let dtype = DType::Utf8(Nullability::Nullable);
    let mut builder = VarBinViewBuilder::with_capacity(dtype, capacity);
    iter.for_each(|item| match item {
        Some(value) => builder.append_value(value.as_ref()),
        None => builder.append_null(),
    });
    builder.finish_into_varbinview().into_data()
}
/// Builds non-nullable binary data from byte values.
pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
    let iter = iter.into_iter();
    let (capacity, _) = iter.size_hint();
    let dtype = DType::Binary(Nullability::NonNullable);
    let mut builder = VarBinViewBuilder::with_capacity(dtype, capacity);
    iter.for_each(|item| builder.append_value(item.as_ref()));
    builder.finish_into_varbinview().into_data()
}
/// Builds nullable binary data from optional byte values; `None` items
/// become nulls.
pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
    iter: I,
) -> Self {
    let iter = iter.into_iter();
    let (capacity, _) = iter.size_hint();
    let dtype = DType::Binary(Nullability::Nullable);
    let mut builder = VarBinViewBuilder::with_capacity(dtype, capacity);
    iter.for_each(|item| match item {
        Some(value) => builder.append_value(value.as_ref()),
        None => builder.append_null(),
    });
    builder.finish_into_varbinview().into_data()
}
}
/// Accessors available on any typed reference to a `VarBinView` array.
pub trait VarBinViewArrayExt: TypedArrayRef<VarBinView> {
    /// Splits the array's dtype into `(is_utf8, nullability)`.
    ///
    /// # Panics
    ///
    /// Panics if the dtype is neither `Utf8` nor `Binary`.
    fn dtype_parts(&self) -> (bool, Nullability) {
        let dtype = self.as_ref().dtype();
        let nullability = match dtype {
            DType::Utf8(n) | DType::Binary(n) => *n,
            _ => unreachable!("VarBinViewArrayExt requires a utf8 or binary dtype"),
        };
        (matches!(dtype, DType::Utf8(_)), nullability)
    }

    /// Rebuilds the array's validity from its validity slot and the dtype's
    /// nullability.
    fn varbinview_validity(&self) -> Validity {
        let validity_slot = &self.as_ref().slots()[VALIDITY_SLOT];
        let (_, nullability) = self.dtype_parts();
        child_to_validity(validity_slot, nullability)
    }

    /// Converts the array's validity into a mask spanning the array's length.
    fn varbinview_validity_mask(&self) -> Mask {
        let validity = self.varbinview_validity();
        let len = self.as_ref().len();
        validity.to_mask(len)
    }
}

/// Every typed `VarBinView` array reference gets the extension methods.
impl<T: TypedArrayRef<VarBinView>> VarBinViewArrayExt for T {}
impl Array<VarBinView> {
/// Assembles an array from data and slots the caller has already validated.
///
/// The array length is taken from `data.len()`.
#[inline]
fn from_prevalidated_data(
    dtype: DType,
    data: VarBinViewData,
    slots: Vec<Option<ArrayRef>>,
) -> Self {
    let row_count = data.len();
    // SAFETY: callers only pass data and slots produced by the validated
    // (or explicitly unchecked) constructors in this file.
    unsafe {
        let parts = ArrayParts::new(VarBinView, dtype, row_count, data).with_slots(slots);
        Array::from_parts_unchecked(parts)
    }
}
/// Builds an array of the given `dtype` from optional byte values; `None`
/// items become nulls.
#[expect(
    clippy::same_name_method,
    reason = "intentionally named from_iter like Iterator::from_iter"
)]
pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
    iter: I,
    dtype: DType,
) -> Self {
    let values = iter.into_iter();
    // Pre-size the builder with the iterator's lower size bound.
    let (lower_bound, _) = values.size_hint();
    let mut builder = VarBinViewBuilder::with_capacity(dtype, lower_bound);
    for entry in values {
        if let Some(bytes) = entry {
            builder.append_value(bytes);
        } else {
            builder.append_null();
        }
    }
    builder.finish_into_varbinview()
}
/// Builds a non-nullable UTF-8 array from string values.
pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
    let values = iter.into_iter();
    let (lower_bound, _) = values.size_hint();
    let dtype = DType::Utf8(Nullability::NonNullable);
    let mut builder = VarBinViewBuilder::with_capacity(dtype, lower_bound);
    values.for_each(|text| builder.append_value(text.as_ref()));
    builder.finish_into_varbinview()
}
/// Builds a nullable UTF-8 array from optional string values; `None` items
/// become nulls.
pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
    iter: I,
) -> Self {
    let values = iter.into_iter();
    let (lower_bound, _) = values.size_hint();
    let dtype = DType::Utf8(Nullability::Nullable);
    let mut builder = VarBinViewBuilder::with_capacity(dtype, lower_bound);
    for entry in values {
        if let Some(text) = entry {
            builder.append_value(text.as_ref());
        } else {
            builder.append_null();
        }
    }
    builder.finish_into_varbinview()
}
/// Builds a non-nullable binary array from byte values.
pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
    let values = iter.into_iter();
    let (lower_bound, _) = values.size_hint();
    let dtype = DType::Binary(Nullability::NonNullable);
    let mut builder = VarBinViewBuilder::with_capacity(dtype, lower_bound);
    values.for_each(|bytes| builder.append_value(bytes.as_ref()));
    builder.finish_into_varbinview()
}
/// Builds a nullable binary array from optional byte values; `None` items
/// become nulls.
pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
    iter: I,
) -> Self {
    let values = iter.into_iter();
    let (lower_bound, _) = values.size_hint();
    let dtype = DType::Binary(Nullability::Nullable);
    let mut builder = VarBinViewBuilder::with_capacity(dtype, lower_bound);
    for entry in values {
        if let Some(bytes) = entry {
            builder.append_value(bytes.as_ref());
        } else {
            builder.append_null();
        }
    }
    builder.finish_into_varbinview()
}
/// Builds an array from host buffers, validating all components.
///
/// # Errors
///
/// Returns the error from [`VarBinViewData::try_new`] if validation fails.
pub fn try_new(
    views: Buffer<BinaryView>,
    buffers: Arc<[ByteBuffer]>,
    dtype: DType,
    validity: Validity,
) -> VortexResult<Self> {
    VarBinViewData::try_new(views, buffers, dtype.clone(), validity.clone()).map(|data| {
        // The validity lives in the slots rather than in the data itself.
        let slots = VarBinViewData::make_slots(&validity, data.len());
        Self::from_prevalidated_data(dtype, data, slots)
    })
}
/// Builds an array from host buffers without validating them in release builds.
///
/// # Safety
///
/// Same contract as [`VarBinViewData::new_unchecked`], to which the parts
/// are forwarded.
pub unsafe fn new_unchecked(
    views: Buffer<BinaryView>,
    buffers: Arc<[ByteBuffer]>,
    dtype: DType,
    validity: Validity,
) -> Self {
    let data_dtype = dtype.clone();
    let data_validity = validity.clone();
    // SAFETY: forwarded from this function's own contract.
    let data =
        unsafe { VarBinViewData::new_unchecked(views, buffers, data_dtype, data_validity) };
    let row_count = data.len();
    let slots = VarBinViewData::make_slots(&validity, row_count);
    Self::from_prevalidated_data(dtype, data, slots)
}
/// Builds an array from buffer handles.
///
/// # Panics
///
/// Panics if [`VarBinViewData::new_handle`] rejects the inputs.
pub fn new_handle(
    views: BufferHandle,
    buffers: Arc<[BufferHandle]>,
    dtype: DType,
    validity: Validity,
) -> Self {
    let data_dtype = dtype.clone();
    let data_validity = validity.clone();
    let data = VarBinViewData::new_handle(views, buffers, data_dtype, data_validity);
    let row_count = data.len();
    let slots = VarBinViewData::make_slots(&validity, row_count);
    Self::from_prevalidated_data(dtype, data, slots)
}
/// Builds an array from buffer handles without inspecting the buffers.
///
/// # Safety
///
/// Same contract as [`VarBinViewData::new_handle_unchecked`], to which the
/// parts are forwarded.
pub unsafe fn new_handle_unchecked(
    views: BufferHandle,
    buffers: Arc<[BufferHandle]>,
    dtype: DType,
    validity: Validity,
) -> Self {
    let data_dtype = dtype.clone();
    let data_validity = validity.clone();
    // SAFETY: forwarded from this function's own contract.
    let data = unsafe {
        VarBinViewData::new_handle_unchecked(views, buffers, data_dtype, data_validity)
    };
    let row_count = data.len();
    let slots = VarBinViewData::make_slots(&validity, row_count);
    Self::from_prevalidated_data(dtype, data, slots)
}
pub fn into_data_parts(self) -> VarBinViewDataParts {
let dtype = self.dtype().clone();
let validity = self.varbinview_validity();
let data = self.into_data();
VarBinViewDataParts {
dtype,
buffers: data.buffers,
views: data.views,
validity,
}
}
}
impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewData {
fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
Self::from_iter_nullable_bin(iter)
}
}
impl FromIterator<Option<Vec<u8>>> for VarBinViewData {
fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
Self::from_iter_nullable_bin(iter)
}
}
impl FromIterator<Option<String>> for VarBinViewData {
fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
Self::from_iter_nullable_str(iter)
}
}
impl<'a> FromIterator<Option<&'a str>> for VarBinViewData {
fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
Self::from_iter_nullable_str(iter)
}
}
impl<'a> FromIterator<Option<&'a [u8]>> for Array<VarBinView> {
fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
Self::from_iter(iter, DType::Binary(Nullability::Nullable))
}
}
impl FromIterator<Option<Vec<u8>>> for Array<VarBinView> {
fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
Self::from_iter(iter, DType::Binary(Nullability::Nullable))
}
}
impl FromIterator<Option<String>> for Array<VarBinView> {
fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
Self::from_iter_nullable_str(iter)
}
}
impl<'a> FromIterator<Option<&'a str>> for Array<VarBinView> {
fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
Self::from_iter_nullable_str(iter)
}
}