1use std::fmt::Display;
5use std::fmt::Formatter;
6use std::mem::size_of;
7use std::sync::Arc;
8
9use vortex_buffer::Alignment;
10use vortex_buffer::Buffer;
11use vortex_buffer::ByteBuffer;
12use vortex_error::VortexExpect;
13use vortex_error::VortexResult;
14use vortex_error::vortex_bail;
15use vortex_error::vortex_ensure;
16use vortex_error::vortex_err;
17use vortex_error::vortex_panic;
18use vortex_mask::Mask;
19
20use crate::ArrayRef;
21use crate::array::Array;
22use crate::array::ArrayParts;
23use crate::array::TypedArrayRef;
24use crate::array::child_to_validity;
25use crate::array::validity_to_child;
26use crate::arrays::VarBinView;
27use crate::arrays::varbinview::BinaryView;
28use crate::buffer::BufferHandle;
29use crate::builders::ArrayBuilder;
30use crate::builders::VarBinViewBuilder;
31use crate::dtype::DType;
32use crate::dtype::Nullability;
33use crate::validity::Validity;
34
35pub(super) const VALIDITY_SLOT: usize = 0;
37pub(super) const NUM_SLOTS: usize = 1;
38pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["validity"];
39
40#[derive(Clone, Debug)]
100pub struct VarBinViewData {
101 pub(super) buffers: Arc<[BufferHandle]>,
102 pub(super) views: BufferHandle,
103}
104
105impl Display for VarBinViewData {
106 fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result {
107 Ok(())
108 }
109}
110
111pub struct VarBinViewDataParts {
112 pub dtype: DType,
113 pub buffers: Arc<[BufferHandle]>,
114 pub views: BufferHandle,
115 pub validity: Validity,
116}
117
118impl VarBinViewData {
119 fn dtype_parts(dtype: &DType) -> VortexResult<(bool, Nullability)> {
120 match dtype {
121 DType::Utf8(nullability) => Ok((true, *nullability)),
122 DType::Binary(nullability) => Ok((false, *nullability)),
123 _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
124 }
125 }
126
127 pub(super) fn make_slots(validity: &Validity, len: usize) -> Vec<Option<ArrayRef>> {
129 vec![validity_to_child(validity, len)]
130 }
131
132 pub fn new(
139 views: Buffer<BinaryView>,
140 buffers: Arc<[ByteBuffer]>,
141 dtype: DType,
142 validity: Validity,
143 ) -> Self {
144 Self::try_new(views, buffers, dtype, validity)
145 .vortex_expect("VarBinViewArray construction failed")
146 }
147
148 pub fn new_handle(
155 views: BufferHandle,
156 buffers: Arc<[BufferHandle]>,
157 dtype: DType,
158 validity: Validity,
159 ) -> Self {
160 Self::try_new_handle(views, buffers, dtype, validity)
161 .vortex_expect("VarbinViewArray construction failed")
162 }
163
164 pub fn try_new(
173 views: Buffer<BinaryView>,
174 buffers: Arc<[ByteBuffer]>,
175 dtype: DType,
176 validity: Validity,
177 ) -> VortexResult<Self> {
178 Self::validate(&views, &buffers, &dtype, &validity)?;
179
180 Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) })
182 }
183
184 pub fn try_new_handle(
193 views: BufferHandle,
194 buffers: Arc<[BufferHandle]>,
195 dtype: DType,
196 validity: Validity,
197 ) -> VortexResult<Self> {
198 let views_nbytes = views.len();
199 vortex_ensure!(
200 views_nbytes.is_multiple_of(size_of::<BinaryView>()),
201 "Expected views buffer length ({views_nbytes}) to be a multiple of {}",
202 size_of::<BinaryView>()
203 );
204
205 if let Some(host) = views.as_host_opt() {
207 vortex_ensure!(
208 host.is_aligned(Alignment::of::<BinaryView>()),
209 "Views on host must be 16 byte aligned"
210 );
211 }
212
213 Ok(unsafe { Self::new_handle_unchecked(views, buffers, dtype, validity) })
215 }
216
217 pub unsafe fn new_unchecked(
247 views: Buffer<BinaryView>,
248 buffers: Arc<[ByteBuffer]>,
249 dtype: DType,
250 validity: Validity,
251 ) -> Self {
252 #[cfg(debug_assertions)]
253 Self::validate(&views, &buffers, &dtype, &validity)
254 .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters");
255
256 let handles: Vec<BufferHandle> = buffers
257 .iter()
258 .cloned()
259 .map(BufferHandle::new_host)
260 .collect();
261
262 let handles = Arc::from(handles);
263 let view_handle = BufferHandle::new_host(views.into_byte_buffer());
264 unsafe { Self::new_handle_unchecked(view_handle, handles, dtype, validity) }
265 }
266
267 pub unsafe fn new_handle_unchecked(
273 views: BufferHandle,
274 buffers: Arc<[BufferHandle]>,
275 dtype: DType,
276 _validity: Validity,
277 ) -> Self {
278 let _ =
279 Self::dtype_parts(&dtype).vortex_expect("VarBinViewArray dtype must be utf8 or binary");
280 Self { buffers, views }
281 }
282
283 pub fn validate(
287 views: &Buffer<BinaryView>,
288 buffers: &Arc<[ByteBuffer]>,
289 dtype: &DType,
290 validity: &Validity,
291 ) -> VortexResult<()> {
292 vortex_ensure!(
293 validity.nullability() == dtype.nullability(),
294 InvalidArgument: "validity {:?} incompatible with nullability {:?}",
295 validity,
296 dtype.nullability()
297 );
298
299 match dtype {
300 DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
301 simdutf8::basic::from_utf8(string).is_ok()
302 })?,
303 DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
304 _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
305 }
306
307 Ok(())
308 }
309
310 fn validate_views<F>(
311 views: &Buffer<BinaryView>,
312 buffers: &Arc<[ByteBuffer]>,
313 validity: &Validity,
314 validator: F,
315 ) -> VortexResult<()>
316 where
317 F: Fn(&[u8]) -> bool,
318 {
319 for (idx, &view) in views.iter().enumerate() {
320 if validity.is_null(idx)? {
321 continue;
322 }
323
324 if view.is_inlined() {
325 let bytes = &view.as_inlined().data[..view.len() as usize];
327 vortex_ensure!(
328 validator(bytes),
329 InvalidArgument: "view at index {idx}: inlined bytes failed utf-8 validation"
330 );
331 } else {
332 let view = view.as_view();
334 let buf_index = view.buffer_index as usize;
335 let start_offset = view.offset as usize;
336 let end_offset = start_offset.saturating_add(view.size as usize);
337
338 let buf = buffers.get(buf_index).ok_or_else(||
339 vortex_err!(InvalidArgument: "view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewData with {} buffers",
340 buffers.len()))?;
341
342 vortex_ensure!(
343 start_offset < buf.len(),
344 InvalidArgument: "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
345 buf.len(),
346 );
347
348 vortex_ensure!(
349 end_offset <= buf.len(),
350 InvalidArgument: "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
351 buf.len(),
352 );
353
354 let bytes = &buf[start_offset..end_offset];
356 vortex_ensure!(
357 view.prefix == bytes[..4],
358 InvalidArgument: "VarBinView prefix does not match full string"
359 );
360
361 vortex_ensure!(
363 validator(bytes),
364 InvalidArgument: "view at index {idx}: outlined bytes fails utf-8 validation"
365 );
366 }
367 }
368
369 Ok(())
370 }
371
372 pub fn len(&self) -> usize {
374 self.views.len() / size_of::<BinaryView>()
375 }
376
377 pub fn is_empty(&self) -> bool {
379 self.len() == 0
380 }
381
382 #[inline]
388 pub fn views(&self) -> &[BinaryView] {
389 let host_views = self.views.as_host();
390 let len = host_views.len() / size_of::<BinaryView>();
391
392 unsafe { std::slice::from_raw_parts(host_views.as_ptr().cast(), len) }
394 }
395
396 pub fn views_handle(&self) -> &BufferHandle {
398 &self.views
399 }
400
401 #[inline]
405 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
406 let views = self.views();
407 let view = &views[index];
408 if !view.is_inlined() {
410 let view_ref = view.as_view();
411 self.buffer(view_ref.buffer_index as usize)
412 .slice(view_ref.as_range())
413 } else {
414 self.views_handle()
416 .as_host()
417 .clone()
418 .into_byte_buffer()
419 .slice_ref(view.as_inlined().value())
420 }
421 }
422
423 #[inline]
430 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
431 if idx >= self.data_buffers().len() {
432 vortex_panic!(
433 "{idx} buffer index out of bounds, there are {} buffers",
434 self.data_buffers().len()
435 );
436 }
437 self.buffers[idx].as_host()
438 }
439
440 #[inline]
442 pub fn data_buffers(&self) -> &Arc<[BufferHandle]> {
443 &self.buffers
444 }
445
446 #[expect(
448 clippy::same_name_method,
449 reason = "intentionally named from_iter like Iterator::from_iter"
450 )]
451 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
452 iter: I,
453 dtype: DType,
454 ) -> Self {
455 let iter = iter.into_iter();
456 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
457
458 for item in iter {
459 match item {
460 None => builder.append_null(),
461 Some(v) => builder.append_value(v),
462 }
463 }
464
465 builder.finish_into_varbinview().into_data()
466 }
467
468 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
469 let iter = iter.into_iter();
470 let mut builder = VarBinViewBuilder::with_capacity(
471 DType::Utf8(Nullability::NonNullable),
472 iter.size_hint().0,
473 );
474
475 for item in iter {
476 builder.append_value(item.as_ref());
477 }
478
479 builder.finish_into_varbinview().into_data()
480 }
481
482 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
483 iter: I,
484 ) -> Self {
485 let iter = iter.into_iter();
486 let mut builder = VarBinViewBuilder::with_capacity(
487 DType::Utf8(Nullability::Nullable),
488 iter.size_hint().0,
489 );
490
491 for item in iter {
492 match item {
493 None => builder.append_null(),
494 Some(v) => builder.append_value(v.as_ref()),
495 }
496 }
497
498 builder.finish_into_varbinview().into_data()
499 }
500
501 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
502 let iter = iter.into_iter();
503 let mut builder = VarBinViewBuilder::with_capacity(
504 DType::Binary(Nullability::NonNullable),
505 iter.size_hint().0,
506 );
507
508 for item in iter {
509 builder.append_value(item.as_ref());
510 }
511
512 builder.finish_into_varbinview().into_data()
513 }
514
515 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
516 iter: I,
517 ) -> Self {
518 let iter = iter.into_iter();
519 let mut builder = VarBinViewBuilder::with_capacity(
520 DType::Binary(Nullability::Nullable),
521 iter.size_hint().0,
522 );
523
524 for item in iter {
525 match item {
526 None => builder.append_null(),
527 Some(v) => builder.append_value(v.as_ref()),
528 }
529 }
530
531 builder.finish_into_varbinview().into_data()
532 }
533}
534
535pub trait VarBinViewArrayExt: TypedArrayRef<VarBinView> {
536 fn dtype_parts(&self) -> (bool, Nullability) {
537 match self.as_ref().dtype() {
538 DType::Utf8(nullability) => (true, *nullability),
539 DType::Binary(nullability) => (false, *nullability),
540 _ => unreachable!("VarBinViewArrayExt requires a utf8 or binary dtype"),
541 }
542 }
543
544 fn varbinview_validity(&self) -> Validity {
545 child_to_validity(&self.as_ref().slots()[VALIDITY_SLOT], self.dtype_parts().1)
546 }
547
548 fn varbinview_validity_mask(&self) -> Mask {
549 self.varbinview_validity().to_mask(self.as_ref().len())
550 }
551}
552impl<T: TypedArrayRef<VarBinView>> VarBinViewArrayExt for T {}
553
554impl Array<VarBinView> {
555 #[inline]
556 fn from_prevalidated_data(
557 dtype: DType,
558 data: VarBinViewData,
559 slots: Vec<Option<ArrayRef>>,
560 ) -> Self {
561 let len = data.len();
562 unsafe {
563 Array::from_parts_unchecked(
564 ArrayParts::new(VarBinView, dtype, len, data).with_slots(slots),
565 )
566 }
567 }
568
569 #[expect(
571 clippy::same_name_method,
572 reason = "intentionally named from_iter like Iterator::from_iter"
573 )]
574 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
575 iter: I,
576 dtype: DType,
577 ) -> Self {
578 let iter = iter.into_iter();
579 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
580 for value in iter {
581 match value {
582 Some(value) => builder.append_value(value),
583 None => builder.append_null(),
584 }
585 }
586 builder.finish_into_varbinview()
587 }
588
589 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
590 let iter = iter.into_iter();
591 let mut builder = VarBinViewBuilder::with_capacity(
592 DType::Utf8(Nullability::NonNullable),
593 iter.size_hint().0,
594 );
595 for value in iter {
596 builder.append_value(value.as_ref());
597 }
598 builder.finish_into_varbinview()
599 }
600
601 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
602 iter: I,
603 ) -> Self {
604 let iter = iter.into_iter();
605 let mut builder = VarBinViewBuilder::with_capacity(
606 DType::Utf8(Nullability::Nullable),
607 iter.size_hint().0,
608 );
609 for value in iter {
610 match value {
611 Some(value) => builder.append_value(value.as_ref()),
612 None => builder.append_null(),
613 }
614 }
615 builder.finish_into_varbinview()
616 }
617
618 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
619 let iter = iter.into_iter();
620 let mut builder = VarBinViewBuilder::with_capacity(
621 DType::Binary(Nullability::NonNullable),
622 iter.size_hint().0,
623 );
624 for value in iter {
625 builder.append_value(value.as_ref());
626 }
627 builder.finish_into_varbinview()
628 }
629
630 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
631 iter: I,
632 ) -> Self {
633 let iter = iter.into_iter();
634 let mut builder = VarBinViewBuilder::with_capacity(
635 DType::Binary(Nullability::Nullable),
636 iter.size_hint().0,
637 );
638 for value in iter {
639 match value {
640 Some(value) => builder.append_value(value.as_ref()),
641 None => builder.append_null(),
642 }
643 }
644 builder.finish_into_varbinview()
645 }
646
647 pub fn try_new(
649 views: Buffer<BinaryView>,
650 buffers: Arc<[ByteBuffer]>,
651 dtype: DType,
652 validity: Validity,
653 ) -> VortexResult<Self> {
654 let data = VarBinViewData::try_new(views, buffers, dtype.clone(), validity.clone())?;
655 let slots = VarBinViewData::make_slots(&validity, data.len());
656 Ok(Self::from_prevalidated_data(dtype, data, slots))
657 }
658
659 pub unsafe fn new_unchecked(
665 views: Buffer<BinaryView>,
666 buffers: Arc<[ByteBuffer]>,
667 dtype: DType,
668 validity: Validity,
669 ) -> Self {
670 let data = unsafe {
671 VarBinViewData::new_unchecked(views, buffers, dtype.clone(), validity.clone())
672 };
673 let slots = VarBinViewData::make_slots(&validity, data.len());
674 Self::from_prevalidated_data(dtype, data, slots)
675 }
676
677 pub fn new_handle(
679 views: BufferHandle,
680 buffers: Arc<[BufferHandle]>,
681 dtype: DType,
682 validity: Validity,
683 ) -> Self {
684 let data = VarBinViewData::new_handle(views, buffers, dtype.clone(), validity.clone());
685 let slots = VarBinViewData::make_slots(&validity, data.len());
686 Self::from_prevalidated_data(dtype, data, slots)
687 }
688
689 pub unsafe fn new_handle_unchecked(
695 views: BufferHandle,
696 buffers: Arc<[BufferHandle]>,
697 dtype: DType,
698 validity: Validity,
699 ) -> Self {
700 let data = unsafe {
701 VarBinViewData::new_handle_unchecked(views, buffers, dtype.clone(), validity.clone())
702 };
703 let slots = VarBinViewData::make_slots(&validity, data.len());
704 Self::from_prevalidated_data(dtype, data, slots)
705 }
706
707 pub fn into_data_parts(self) -> VarBinViewDataParts {
708 let dtype = self.dtype().clone();
709 let validity = self.varbinview_validity();
710 let data = self.into_data();
711 VarBinViewDataParts {
712 dtype,
713 buffers: data.buffers,
714 views: data.views,
715 validity,
716 }
717 }
718}
719
720impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewData {
721 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
722 Self::from_iter_nullable_bin(iter)
723 }
724}
725
726impl FromIterator<Option<Vec<u8>>> for VarBinViewData {
727 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
728 Self::from_iter_nullable_bin(iter)
729 }
730}
731
732impl FromIterator<Option<String>> for VarBinViewData {
733 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
734 Self::from_iter_nullable_str(iter)
735 }
736}
737
738impl<'a> FromIterator<Option<&'a str>> for VarBinViewData {
739 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
740 Self::from_iter_nullable_str(iter)
741 }
742}
743
744impl<'a> FromIterator<Option<&'a [u8]>> for Array<VarBinView> {
747 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
748 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
749 }
750}
751
752impl FromIterator<Option<Vec<u8>>> for Array<VarBinView> {
753 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
754 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
755 }
756}
757
758impl FromIterator<Option<String>> for Array<VarBinView> {
759 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
760 Self::from_iter_nullable_str(iter)
761 }
762}
763
764impl<'a> FromIterator<Option<&'a str>> for Array<VarBinView> {
765 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
766 Self::from_iter_nullable_str(iter)
767 }
768}