1use std::fmt::Display;
5use std::fmt::Formatter;
6use std::mem::size_of;
7use std::sync::Arc;
8
9use vortex_buffer::Alignment;
10use vortex_buffer::Buffer;
11use vortex_buffer::ByteBuffer;
12use vortex_error::VortexExpect;
13use vortex_error::VortexResult;
14use vortex_error::vortex_bail;
15use vortex_error::vortex_ensure;
16use vortex_error::vortex_err;
17use vortex_error::vortex_panic;
18
19use crate::ArrayRef;
20use crate::array::Array;
21use crate::array::ArrayParts;
22use crate::array::TypedArrayRef;
23use crate::array::child_to_validity;
24use crate::array::validity_to_child;
25use crate::arrays::VarBinView;
26use crate::arrays::varbinview::BinaryView;
27use crate::buffer::BufferHandle;
28use crate::builders::ArrayBuilder;
29use crate::builders::VarBinViewBuilder;
30use crate::dtype::DType;
31use crate::dtype::Nullability;
32use crate::validity::Validity;
33
34pub(super) const VALIDITY_SLOT: usize = 0;
36pub(super) const NUM_SLOTS: usize = 1;
37pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["validity"];
38
39#[derive(Clone, Debug)]
99pub struct VarBinViewData {
100 pub(super) buffers: Arc<[BufferHandle]>,
101 pub(super) views: BufferHandle,
102}
103
104impl Display for VarBinViewData {
105 fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result {
106 Ok(())
107 }
108}
109
110pub struct VarBinViewDataParts {
111 pub dtype: DType,
112 pub buffers: Arc<[BufferHandle]>,
113 pub views: BufferHandle,
114 pub validity: Validity,
115}
116
117impl VarBinViewData {
118 fn dtype_parts(dtype: &DType) -> VortexResult<(bool, Nullability)> {
119 match dtype {
120 DType::Utf8(nullability) => Ok((true, *nullability)),
121 DType::Binary(nullability) => Ok((false, *nullability)),
122 _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
123 }
124 }
125
126 pub(super) fn make_slots(validity: &Validity, len: usize) -> Vec<Option<ArrayRef>> {
128 vec![validity_to_child(validity, len)]
129 }
130
131 pub fn new(
138 views: Buffer<BinaryView>,
139 buffers: Arc<[ByteBuffer]>,
140 dtype: DType,
141 validity: Validity,
142 ) -> Self {
143 Self::try_new(views, buffers, dtype, validity)
144 .vortex_expect("VarBinViewArray construction failed")
145 }
146
147 pub fn new_handle(
154 views: BufferHandle,
155 buffers: Arc<[BufferHandle]>,
156 dtype: DType,
157 validity: Validity,
158 ) -> Self {
159 Self::try_new_handle(views, buffers, dtype, validity)
160 .vortex_expect("VarbinViewArray construction failed")
161 }
162
163 pub fn try_new(
172 views: Buffer<BinaryView>,
173 buffers: Arc<[ByteBuffer]>,
174 dtype: DType,
175 validity: Validity,
176 ) -> VortexResult<Self> {
177 Self::validate(&views, &buffers, &dtype, &validity)?;
178
179 Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) })
181 }
182
183 pub fn try_new_handle(
192 views: BufferHandle,
193 buffers: Arc<[BufferHandle]>,
194 dtype: DType,
195 validity: Validity,
196 ) -> VortexResult<Self> {
197 let views_nbytes = views.len();
198 vortex_ensure!(
199 views_nbytes.is_multiple_of(size_of::<BinaryView>()),
200 "Expected views buffer length ({views_nbytes}) to be a multiple of {}",
201 size_of::<BinaryView>()
202 );
203
204 if let Some(host) = views.as_host_opt() {
206 vortex_ensure!(
207 host.is_aligned(Alignment::of::<BinaryView>()),
208 "Views on host must be 16 byte aligned"
209 );
210 }
211
212 Ok(unsafe { Self::new_handle_unchecked(views, buffers, dtype, validity) })
214 }
215
216 pub unsafe fn new_unchecked(
246 views: Buffer<BinaryView>,
247 buffers: Arc<[ByteBuffer]>,
248 dtype: DType,
249 validity: Validity,
250 ) -> Self {
251 #[cfg(debug_assertions)]
252 Self::validate(&views, &buffers, &dtype, &validity)
253 .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters");
254
255 let handles: Vec<BufferHandle> = buffers
256 .iter()
257 .cloned()
258 .map(BufferHandle::new_host)
259 .collect();
260
261 let handles = Arc::from(handles);
262 let view_handle = BufferHandle::new_host(views.into_byte_buffer());
263 unsafe { Self::new_handle_unchecked(view_handle, handles, dtype, validity) }
264 }
265
266 pub unsafe fn new_handle_unchecked(
272 views: BufferHandle,
273 buffers: Arc<[BufferHandle]>,
274 dtype: DType,
275 _validity: Validity,
276 ) -> Self {
277 let _ =
278 Self::dtype_parts(&dtype).vortex_expect("VarBinViewArray dtype must be utf8 or binary");
279 Self { buffers, views }
280 }
281
282 pub fn validate(
286 views: &Buffer<BinaryView>,
287 buffers: &Arc<[ByteBuffer]>,
288 dtype: &DType,
289 validity: &Validity,
290 ) -> VortexResult<()> {
291 vortex_ensure!(
292 validity.nullability() == dtype.nullability(),
293 InvalidArgument: "validity {:?} incompatible with nullability {:?}",
294 validity,
295 dtype.nullability()
296 );
297
298 match dtype {
299 DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
300 simdutf8::basic::from_utf8(string).is_ok()
301 })?,
302 DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
303 _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
304 }
305
306 Ok(())
307 }
308
309 fn validate_views<F>(
310 views: &Buffer<BinaryView>,
311 buffers: &Arc<[ByteBuffer]>,
312 validity: &Validity,
313 validator: F,
314 ) -> VortexResult<()>
315 where
316 F: Fn(&[u8]) -> bool,
317 {
318 for (idx, &view) in views.iter().enumerate() {
319 if validity.is_null(idx)? {
320 continue;
321 }
322
323 if view.is_inlined() {
324 let bytes = &view.as_inlined().data[..view.len() as usize];
326 vortex_ensure!(
327 validator(bytes),
328 InvalidArgument: "view at index {idx}: inlined bytes failed utf-8 validation"
329 );
330 } else {
331 let view = view.as_view();
333 let buf_index = view.buffer_index as usize;
334 let start_offset = view.offset as usize;
335 let end_offset = start_offset.saturating_add(view.size as usize);
336
337 let buf = buffers.get(buf_index).ok_or_else(||
338 vortex_err!(InvalidArgument: "view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewData with {} buffers",
339 buffers.len()))?;
340
341 vortex_ensure!(
342 start_offset < buf.len(),
343 InvalidArgument: "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
344 buf.len(),
345 );
346
347 vortex_ensure!(
348 end_offset <= buf.len(),
349 InvalidArgument: "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
350 buf.len(),
351 );
352
353 let bytes = &buf[start_offset..end_offset];
355 vortex_ensure!(
356 view.prefix == bytes[..4],
357 InvalidArgument: "VarBinView prefix does not match full string"
358 );
359
360 vortex_ensure!(
362 validator(bytes),
363 InvalidArgument: "view at index {idx}: outlined bytes fails utf-8 validation"
364 );
365 }
366 }
367
368 Ok(())
369 }
370
371 pub fn len(&self) -> usize {
373 self.views.len() / size_of::<BinaryView>()
374 }
375
376 pub fn is_empty(&self) -> bool {
378 self.len() == 0
379 }
380
381 #[inline]
387 pub fn views(&self) -> &[BinaryView] {
388 let host_views = self.views.as_host();
389 let len = host_views.len() / size_of::<BinaryView>();
390
391 unsafe { std::slice::from_raw_parts(host_views.as_ptr().cast(), len) }
393 }
394
395 pub fn views_handle(&self) -> &BufferHandle {
397 &self.views
398 }
399
400 #[inline]
404 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
405 let views = self.views();
406 let view = &views[index];
407 if !view.is_inlined() {
409 let view_ref = view.as_view();
410 self.buffer(view_ref.buffer_index as usize)
411 .slice(view_ref.as_range())
412 } else {
413 self.views_handle()
415 .as_host()
416 .clone()
417 .into_byte_buffer()
418 .slice_ref(view.as_inlined().value())
419 }
420 }
421
422 #[inline]
429 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
430 if idx >= self.data_buffers().len() {
431 vortex_panic!(
432 "{idx} buffer index out of bounds, there are {} buffers",
433 self.data_buffers().len()
434 );
435 }
436 self.buffers[idx].as_host()
437 }
438
439 #[inline]
441 pub fn data_buffers(&self) -> &Arc<[BufferHandle]> {
442 &self.buffers
443 }
444
445 #[expect(
447 clippy::same_name_method,
448 reason = "intentionally named from_iter like Iterator::from_iter"
449 )]
450 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
451 iter: I,
452 dtype: DType,
453 ) -> Self {
454 let iter = iter.into_iter();
455 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
456
457 for item in iter {
458 match item {
459 None => builder.append_null(),
460 Some(v) => builder.append_value(v),
461 }
462 }
463
464 builder.finish_into_varbinview().into_data()
465 }
466
467 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
468 let iter = iter.into_iter();
469 let mut builder = VarBinViewBuilder::with_capacity(
470 DType::Utf8(Nullability::NonNullable),
471 iter.size_hint().0,
472 );
473
474 for item in iter {
475 builder.append_value(item.as_ref());
476 }
477
478 builder.finish_into_varbinview().into_data()
479 }
480
481 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
482 iter: I,
483 ) -> Self {
484 let iter = iter.into_iter();
485 let mut builder = VarBinViewBuilder::with_capacity(
486 DType::Utf8(Nullability::Nullable),
487 iter.size_hint().0,
488 );
489
490 for item in iter {
491 match item {
492 None => builder.append_null(),
493 Some(v) => builder.append_value(v.as_ref()),
494 }
495 }
496
497 builder.finish_into_varbinview().into_data()
498 }
499
500 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
501 let iter = iter.into_iter();
502 let mut builder = VarBinViewBuilder::with_capacity(
503 DType::Binary(Nullability::NonNullable),
504 iter.size_hint().0,
505 );
506
507 for item in iter {
508 builder.append_value(item.as_ref());
509 }
510
511 builder.finish_into_varbinview().into_data()
512 }
513
514 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
515 iter: I,
516 ) -> Self {
517 let iter = iter.into_iter();
518 let mut builder = VarBinViewBuilder::with_capacity(
519 DType::Binary(Nullability::Nullable),
520 iter.size_hint().0,
521 );
522
523 for item in iter {
524 match item {
525 None => builder.append_null(),
526 Some(v) => builder.append_value(v.as_ref()),
527 }
528 }
529
530 builder.finish_into_varbinview().into_data()
531 }
532}
533
534pub trait VarBinViewArrayExt: TypedArrayRef<VarBinView> {
535 fn dtype_parts(&self) -> (bool, Nullability) {
536 match self.as_ref().dtype() {
537 DType::Utf8(nullability) => (true, *nullability),
538 DType::Binary(nullability) => (false, *nullability),
539 _ => unreachable!("VarBinViewArrayExt requires a utf8 or binary dtype"),
540 }
541 }
542
543 fn varbinview_validity(&self) -> Validity {
544 child_to_validity(
545 self.as_ref().slots()[VALIDITY_SLOT].as_ref(),
546 self.dtype_parts().1,
547 )
548 }
549}
550impl<T: TypedArrayRef<VarBinView>> VarBinViewArrayExt for T {}
551
552impl Array<VarBinView> {
553 #[inline]
554 fn from_prevalidated_data(
555 dtype: DType,
556 data: VarBinViewData,
557 slots: Vec<Option<ArrayRef>>,
558 ) -> Self {
559 let len = data.len();
560 unsafe {
561 Array::from_parts_unchecked(
562 ArrayParts::new(VarBinView, dtype, len, data).with_slots(slots),
563 )
564 }
565 }
566
567 #[expect(
569 clippy::same_name_method,
570 reason = "intentionally named from_iter like Iterator::from_iter"
571 )]
572 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
573 iter: I,
574 dtype: DType,
575 ) -> Self {
576 let iter = iter.into_iter();
577 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
578 for value in iter {
579 match value {
580 Some(value) => builder.append_value(value),
581 None => builder.append_null(),
582 }
583 }
584 builder.finish_into_varbinview()
585 }
586
587 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
588 let iter = iter.into_iter();
589 let mut builder = VarBinViewBuilder::with_capacity(
590 DType::Utf8(Nullability::NonNullable),
591 iter.size_hint().0,
592 );
593 for value in iter {
594 builder.append_value(value.as_ref());
595 }
596 builder.finish_into_varbinview()
597 }
598
599 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
600 iter: I,
601 ) -> Self {
602 let iter = iter.into_iter();
603 let mut builder = VarBinViewBuilder::with_capacity(
604 DType::Utf8(Nullability::Nullable),
605 iter.size_hint().0,
606 );
607 for value in iter {
608 match value {
609 Some(value) => builder.append_value(value.as_ref()),
610 None => builder.append_null(),
611 }
612 }
613 builder.finish_into_varbinview()
614 }
615
616 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
617 let iter = iter.into_iter();
618 let mut builder = VarBinViewBuilder::with_capacity(
619 DType::Binary(Nullability::NonNullable),
620 iter.size_hint().0,
621 );
622 for value in iter {
623 builder.append_value(value.as_ref());
624 }
625 builder.finish_into_varbinview()
626 }
627
628 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
629 iter: I,
630 ) -> Self {
631 let iter = iter.into_iter();
632 let mut builder = VarBinViewBuilder::with_capacity(
633 DType::Binary(Nullability::Nullable),
634 iter.size_hint().0,
635 );
636 for value in iter {
637 match value {
638 Some(value) => builder.append_value(value.as_ref()),
639 None => builder.append_null(),
640 }
641 }
642 builder.finish_into_varbinview()
643 }
644
645 pub fn try_new(
647 views: Buffer<BinaryView>,
648 buffers: Arc<[ByteBuffer]>,
649 dtype: DType,
650 validity: Validity,
651 ) -> VortexResult<Self> {
652 let data = VarBinViewData::try_new(views, buffers, dtype.clone(), validity.clone())?;
653 let slots = VarBinViewData::make_slots(&validity, data.len());
654 Ok(Self::from_prevalidated_data(dtype, data, slots))
655 }
656
657 pub unsafe fn new_unchecked(
663 views: Buffer<BinaryView>,
664 buffers: Arc<[ByteBuffer]>,
665 dtype: DType,
666 validity: Validity,
667 ) -> Self {
668 let data = unsafe {
669 VarBinViewData::new_unchecked(views, buffers, dtype.clone(), validity.clone())
670 };
671 let slots = VarBinViewData::make_slots(&validity, data.len());
672 Self::from_prevalidated_data(dtype, data, slots)
673 }
674
675 pub fn new_handle(
677 views: BufferHandle,
678 buffers: Arc<[BufferHandle]>,
679 dtype: DType,
680 validity: Validity,
681 ) -> Self {
682 let data = VarBinViewData::new_handle(views, buffers, dtype.clone(), validity.clone());
683 let slots = VarBinViewData::make_slots(&validity, data.len());
684 Self::from_prevalidated_data(dtype, data, slots)
685 }
686
687 pub unsafe fn new_handle_unchecked(
693 views: BufferHandle,
694 buffers: Arc<[BufferHandle]>,
695 dtype: DType,
696 validity: Validity,
697 ) -> Self {
698 let data = unsafe {
699 VarBinViewData::new_handle_unchecked(views, buffers, dtype.clone(), validity.clone())
700 };
701 let slots = VarBinViewData::make_slots(&validity, data.len());
702 Self::from_prevalidated_data(dtype, data, slots)
703 }
704
705 pub fn into_data_parts(self) -> VarBinViewDataParts {
706 let dtype = self.dtype().clone();
707 let validity = self.varbinview_validity();
708 let data = self.into_data();
709 VarBinViewDataParts {
710 dtype,
711 buffers: data.buffers,
712 views: data.views,
713 validity,
714 }
715 }
716}
717
718impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewData {
719 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
720 Self::from_iter_nullable_bin(iter)
721 }
722}
723
724impl FromIterator<Option<Vec<u8>>> for VarBinViewData {
725 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
726 Self::from_iter_nullable_bin(iter)
727 }
728}
729
730impl FromIterator<Option<String>> for VarBinViewData {
731 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
732 Self::from_iter_nullable_str(iter)
733 }
734}
735
736impl<'a> FromIterator<Option<&'a str>> for VarBinViewData {
737 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
738 Self::from_iter_nullable_str(iter)
739 }
740}
741
742impl<'a> FromIterator<Option<&'a [u8]>> for Array<VarBinView> {
745 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
746 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
747 }
748}
749
750impl FromIterator<Option<Vec<u8>>> for Array<VarBinView> {
751 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
752 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
753 }
754}
755
756impl FromIterator<Option<String>> for Array<VarBinView> {
757 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
758 Self::from_iter_nullable_str(iter)
759 }
760}
761
762impl<'a> FromIterator<Option<&'a str>> for Array<VarBinView> {
763 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
764 Self::from_iter_nullable_str(iter)
765 }
766}