use std::sync::Arc;
use vortex_buffer::Alignment;
use vortex_buffer::Buffer;
use vortex_buffer::ByteBuffer;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use vortex_error::vortex_bail;
use vortex_error::vortex_ensure;
use vortex_error::vortex_err;
use vortex_error::vortex_panic;
use crate::arrays::varbinview::BinaryView;
use crate::buffer::BufferHandle;
use crate::builders::ArrayBuilder;
use crate::builders::VarBinViewBuilder;
use crate::dtype::DType;
use crate::dtype::Nullability;
use crate::stats::ArrayStats;
use crate::validity::Validity;
/// An array of variable-length byte strings addressed through [`BinaryView`] entries.
///
/// Each view either carries its bytes inline or points at a range inside one of the
/// data buffers (identified by a buffer index, an offset and a size).
#[derive(Clone, Debug)]
pub struct VarBinViewArray {
/// Logical type of the array; `validate` only accepts `DType::Utf8` and `DType::Binary`.
pub(super) dtype: DType,
/// Data buffers that non-inlined views refer to by index.
pub(super) buffers: Arc<[BufferHandle]>,
/// Handle to the bytes holding the packed `BinaryView` entries.
pub(super) views: BufferHandle,
/// Validity of the elements; its nullability is expected to match `dtype`.
pub(super) validity: Validity,
/// Statistics attached to this array; starts out as the default value on construction.
pub(super) stats_set: ArrayStats,
}
/// The owned components of a [`VarBinViewArray`], as returned by
/// `VarBinViewArray::into_parts`.
///
/// Holds every field of the array except its statistics.
pub struct VarBinViewArrayParts {
/// Logical type of the array the parts were taken from.
pub dtype: DType,
/// Data buffers referenced by non-inlined views.
pub buffers: Arc<[BufferHandle]>,
/// Handle to the bytes holding the packed `BinaryView` entries.
pub views: BufferHandle,
/// Validity of the elements.
pub validity: Validity,
}
impl VarBinViewArray {
/// Builds a `VarBinViewArray` from typed views and byte buffers.
///
/// This is the panicking counterpart of [`Self::try_new`]: the arguments go through
/// the same validation.
///
/// # Panics
///
/// Panics if [`Self::try_new`] rejects the arguments.
pub fn new(
    views: Buffer<BinaryView>,
    buffers: Arc<[ByteBuffer]>,
    dtype: DType,
    validity: Validity,
) -> Self {
    let built = Self::try_new(views, buffers, dtype, validity);
    built.vortex_expect("VarBinViewArray construction failed")
}
/// Builds a `VarBinViewArray` directly from buffer handles.
///
/// This is the panicking counterpart of [`Self::try_new_handle`].
///
/// # Panics
///
/// Panics if [`Self::try_new_handle`] rejects the arguments.
pub fn new_handle(
    views: BufferHandle,
    buffers: Arc<[BufferHandle]>,
    dtype: DType,
    validity: Validity,
) -> Self {
    // The message spells the type name exactly like `new` does, so that a search for
    // "VarBinViewArray construction failed" finds failures from both constructors.
    Self::try_new_handle(views, buffers, dtype, validity)
        .vortex_expect("VarBinViewArray construction failed")
}
/// Validates the components and, on success, assembles a `VarBinViewArray`.
///
/// # Errors
///
/// Returns whatever error [`Self::validate`] reports for the given components.
pub fn try_new(
    views: Buffer<BinaryView>,
    buffers: Arc<[ByteBuffer]>,
    dtype: DType,
    validity: Validity,
) -> VortexResult<Self> {
    Self::validate(&views, &buffers, &dtype, &validity).map(|()| {
        // SAFETY: `validate` has just accepted exactly these components.
        unsafe { Self::new_unchecked(views, buffers, dtype, validity) }
    })
}
/// Assembles a `VarBinViewArray` from buffer handles after checking everything that
/// can be checked without reading the view or data contents.
///
/// The checks are:
/// - the views buffer length is a whole number of `BinaryView` entries;
/// - when the views live on the host, they are aligned for `BinaryView`;
/// - `dtype` is `DType::Utf8` or `DType::Binary`;
/// - the nullability of `validity` matches the nullability of `dtype`.
///
/// The individual views and the data buffers are NOT inspected here.
///
/// # Errors
///
/// Returns an error if any of the checks above fails.
pub fn try_new_handle(
    views: BufferHandle,
    buffers: Arc<[BufferHandle]>,
    dtype: DType,
    validity: Validity,
) -> VortexResult<Self> {
    let views_nbytes = views.len();
    vortex_ensure!(
        views_nbytes.is_multiple_of(size_of::<BinaryView>()),
        "Expected views buffer length ({views_nbytes}) to be a multiple of {}",
        size_of::<BinaryView>()
    );
    if let Some(host) = views.as_host_opt() {
        vortex_ensure!(
            host.is_aligned(Alignment::of::<BinaryView>()),
            "Views on host must be 16 byte aligned"
        );
    }

    // Metadata-only invariants that `validate` enforces for the typed constructor.
    // They need no access to buffer contents, so they apply to every handle.
    vortex_ensure!(
        matches!(&dtype, DType::Utf8(_) | DType::Binary(_)),
        InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"
    );
    vortex_ensure!(
        validity.nullability() == dtype.nullability(),
        InvalidArgument: "validity {:?} incompatible with nullability {:?}",
        validity,
        dtype.nullability()
    );

    // SAFETY: size, host alignment, dtype and nullability were checked above; the
    // contents of the views and data buffers are trusted as provided by the caller.
    Ok(unsafe { Self::new_handle_unchecked(views, buffers, dtype, validity) })
}
/// Assembles a `VarBinViewArray` from typed views and byte buffers without validating
/// them in release builds.
///
/// Every buffer is wrapped into a host `BufferHandle` before being handed to
/// [`Self::new_handle_unchecked`].
///
/// # Safety
///
/// The components must satisfy everything [`Self::validate`] checks. That is only
/// asserted when `debug_assertions` are enabled.
pub unsafe fn new_unchecked(
    views: Buffer<BinaryView>,
    buffers: Arc<[ByteBuffer]>,
    dtype: DType,
    validity: Validity,
) -> Self {
    #[cfg(debug_assertions)]
    Self::validate(&views, &buffers, &dtype, &validity)
        .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters");

    let data_handles: Arc<[BufferHandle]> = buffers
        .iter()
        .map(|buffer| BufferHandle::new_host(buffer.clone()))
        .collect();
    let views_handle = BufferHandle::new_host(views.into_byte_buffer());

    // SAFETY: the caller's contract for this function is forwarded unchanged.
    unsafe { Self::new_handle_unchecked(views_handle, data_handles, dtype, validity) }
}
pub unsafe fn new_handle_unchecked(
views: BufferHandle,
buffers: Arc<[BufferHandle]>,
dtype: DType,
validity: Validity,
) -> Self {
Self {
views,
buffers,
dtype,
validity,
stats_set: Default::default(),
}
}
/// Checks that the given components form a valid `VarBinViewArray`.
///
/// The nullability of `validity` must equal that of `dtype`, and `dtype` must be
/// `DType::Utf8` or `DType::Binary`. Every non-null view is then checked by
/// `validate_views`; for `Utf8` the bytes must additionally be valid UTF-8.
///
/// # Errors
///
/// Returns an `InvalidArgument` error describing the first violated requirement.
pub fn validate(
    views: &Buffer<BinaryView>,
    buffers: &Arc<[ByteBuffer]>,
    dtype: &DType,
    validity: &Validity,
) -> VortexResult<()> {
    vortex_ensure!(
        validity.nullability() == dtype.nullability(),
        InvalidArgument: "validity {:?} incompatible with nullability {:?}",
        validity,
        dtype.nullability()
    );

    // The per-view result is the overall result, so the match is the tail expression.
    match dtype {
        DType::Utf8(_) => Self::validate_views(views, buffers, validity, |candidate| {
            simdutf8::basic::from_utf8(candidate).is_ok()
        }),
        // Binary data accepts any byte sequence.
        DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true),
        _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
    }
}
fn validate_views<F>(
views: &Buffer<BinaryView>,
buffers: &Arc<[ByteBuffer]>,
validity: &Validity,
validator: F,
) -> VortexResult<()>
where
F: Fn(&[u8]) -> bool,
{
for (idx, &view) in views.iter().enumerate() {
if validity.is_null(idx)? {
continue;
}
if view.is_inlined() {
let bytes = &view.as_inlined().data[..view.len() as usize];
vortex_ensure!(
validator(bytes),
InvalidArgument: "view at index {idx}: inlined bytes failed utf-8 validation"
);
} else {
let view = view.as_view();
let buf_index = view.buffer_index as usize;
let start_offset = view.offset as usize;
let end_offset = start_offset.saturating_add(view.size as usize);
let buf = buffers.get(buf_index).ok_or_else(||
vortex_err!(InvalidArgument: "view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers",
buffers.len()))?;
vortex_ensure!(
start_offset < buf.len(),
InvalidArgument: "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
buf.len(),
);
vortex_ensure!(
end_offset <= buf.len(),
InvalidArgument: "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
buf.len(),
);
let bytes = &buf[start_offset..end_offset];
vortex_ensure!(
view.prefix == bytes[..4],
InvalidArgument: "VarBinView prefix does not match full string"
);
vortex_ensure!(
validator(bytes),
InvalidArgument: "view at index {idx}: outlined bytes fails utf-8 validation"
);
}
}
Ok(())
}
pub fn into_parts(self) -> VarBinViewArrayParts {
VarBinViewArrayParts {
dtype: self.dtype,
buffers: self.buffers,
views: self.views,
validity: self.validity,
}
}
pub fn nbuffers(&self) -> usize {
self.buffers.len()
}
/// Returns the views of this array as a slice of `BinaryView`.
///
/// The views are read through `BufferHandle::as_host`, so they must be available on
/// the host (NOTE(review): `as_host` presumably panics otherwise — confirm).
#[inline]
pub fn views(&self) -> &[BinaryView] {
    let host_views = self.views.as_host();
    let base = host_views.as_ptr().cast::<BinaryView>();
    let count = host_views.len() / size_of::<BinaryView>();

    // SAFETY: `try_new_handle` checks that host views are aligned for `BinaryView`
    // and that their byte length is a multiple of its size, and `new_unchecked`
    // derives the bytes from a `Buffer<BinaryView>`. Callers of
    // `new_handle_unchecked` are responsible for the same guarantees. The returned
    // slice borrows from `self`, which keeps the bytes alive.
    unsafe { std::slice::from_raw_parts(base, count) }
}
/// Returns the buffer handle that holds the views, without interpreting its bytes.
pub fn views_handle(&self) -> &BufferHandle {
&self.views
}
/// Returns the bytes of the element at `index` as a `ByteBuffer`.
///
/// For an inlined view the result is a slice of the views buffer itself; otherwise it
/// is a slice of the data buffer the view refers to.
///
/// # Panics
///
/// Panics if `index` is past the end of the views, or if the referenced buffer index
/// is out of bounds (see [`Self::buffer`]). NOTE(review): presumably also panics when
/// the views or the data buffer are not host-resident — confirm against `as_host`.
#[inline]
pub fn bytes_at(&self, index: usize) -> ByteBuffer {
    let entry = &self.views()[index];

    if entry.is_inlined() {
        // The payload lives inside the view, so slice the views buffer around it.
        let views_bytes = self.views_handle().as_host().clone().into_byte_buffer();
        return views_bytes.slice_ref(entry.as_inlined().value());
    }

    let reference = entry.as_view();
    let data = self.buffer(reference.buffer_index as usize);
    data.slice(reference.as_range())
}
/// Returns the data buffer at position `idx` as a host `ByteBuffer`.
///
/// # Panics
///
/// Panics if `idx` is not a valid buffer position. NOTE(review): presumably also
/// panics when the buffer is not host-resident — confirm against `as_host`.
#[inline]
pub fn buffer(&self, idx: usize) -> &ByteBuffer {
    let Some(handle) = self.buffers.get(idx) else {
        vortex_panic!(
            "{idx} buffer index out of bounds, there are {} buffers",
            self.nbuffers()
        )
    };
    handle.as_host()
}
/// Returns the shared list of data buffer handles held by this array.
#[inline]
pub fn buffers(&self) -> &Arc<[BufferHandle]> {
&self.buffers
}
/// Builds a `VarBinViewArray` of the given `dtype` from optional byte strings.
///
/// `None` items become nulls; `Some` items are appended as values. The lower bound of
/// the iterator's size hint is used as the builder's initial capacity.
#[expect(
    clippy::same_name_method,
    reason = "intentionally named from_iter like Iterator::from_iter"
)]
pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
    iter: I,
    dtype: DType,
) -> Self {
    let items = iter.into_iter();
    let (lower_bound, _) = items.size_hint();
    let mut builder = VarBinViewBuilder::with_capacity(dtype, lower_bound);

    items.for_each(|item| match item {
        Some(value) => builder.append_value(value),
        None => builder.append_null(),
    });

    builder.finish_into_varbinview()
}
/// Builds a non-nullable UTF-8 `VarBinViewArray` from string-like items.
///
/// The lower bound of the iterator's size hint is used as the builder's initial
/// capacity.
pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
    let items = iter.into_iter();
    let (lower_bound, _) = items.size_hint();
    let dtype = DType::Utf8(Nullability::NonNullable);
    let mut builder = VarBinViewBuilder::with_capacity(dtype, lower_bound);

    items.for_each(|text| builder.append_value(text.as_ref()));

    builder.finish_into_varbinview()
}
/// Builds a nullable UTF-8 `VarBinViewArray` from optional string-like items.
///
/// `None` items become nulls. The lower bound of the iterator's size hint is used as
/// the builder's initial capacity.
pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
    iter: I,
) -> Self {
    let items = iter.into_iter();
    let (lower_bound, _) = items.size_hint();
    let dtype = DType::Utf8(Nullability::Nullable);
    let mut builder = VarBinViewBuilder::with_capacity(dtype, lower_bound);

    for item in items {
        if let Some(text) = item {
            builder.append_value(text.as_ref());
        } else {
            builder.append_null();
        }
    }

    builder.finish_into_varbinview()
}
/// Builds a non-nullable binary `VarBinViewArray` from byte-string-like items.
///
/// The lower bound of the iterator's size hint is used as the builder's initial
/// capacity.
pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
    let items = iter.into_iter();
    let (lower_bound, _) = items.size_hint();
    let dtype = DType::Binary(Nullability::NonNullable);
    let mut builder = VarBinViewBuilder::with_capacity(dtype, lower_bound);

    items.for_each(|bytes| builder.append_value(bytes.as_ref()));

    builder.finish_into_varbinview()
}
pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
iter: I,
) -> Self {
let iter = iter.into_iter();
let mut builder = VarBinViewBuilder::with_capacity(
DType::Binary(Nullability::Nullable),
iter.size_hint().0,
);
for item in iter {
match item {
None => builder.append_null(),
Some(v) => builder.append_value(v.as_ref()),
}
}
builder.finish_into_varbinview()
}
}
impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
Self::from_iter_nullable_bin(iter)
}
}
impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
Self::from_iter_nullable_bin(iter)
}
}
impl FromIterator<Option<String>> for VarBinViewArray {
fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
Self::from_iter_nullable_str(iter)
}
}
impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
Self::from_iter_nullable_str(iter)
}
}