1use std::fmt::Display;
5use std::fmt::Formatter;
6use std::mem::size_of;
7use std::sync::Arc;
8
9use smallvec::smallvec;
10use vortex_buffer::Alignment;
11use vortex_buffer::Buffer;
12use vortex_buffer::ByteBuffer;
13use vortex_error::VortexExpect;
14use vortex_error::VortexResult;
15use vortex_error::vortex_bail;
16use vortex_error::vortex_ensure;
17use vortex_error::vortex_err;
18use vortex_error::vortex_panic;
19
20use crate::ArraySlots;
21use crate::LEGACY_SESSION;
22use crate::VortexSessionExecute;
23use crate::array::Array;
24use crate::array::ArrayParts;
25use crate::array::TypedArrayRef;
26use crate::array::child_to_validity;
27use crate::array::validity_to_child;
28use crate::arrays::VarBinView;
29use crate::arrays::varbinview::BinaryView;
30use crate::buffer::BufferHandle;
31use crate::builders::ArrayBuilder;
32use crate::builders::VarBinViewBuilder;
33use crate::dtype::DType;
34use crate::dtype::Nullability;
35use crate::validity::Validity;
36
/// Index of the validity child within the array's slots.
pub(super) const VALIDITY_SLOT: usize = 0;
/// Number of slots this array carries.
pub(super) const NUM_SLOTS: usize = 1;
/// Slot names, indexed by slot position.
pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["validity"];
41
/// Buffer-level payload of a `VarBinView` array: the view records and the
/// data buffers that non-inlined views point into.
///
/// Validity is not a field of this struct; `make_slots` turns it into a slot entry.
#[derive(Clone, Debug)]
pub struct VarBinViewData {
    /// Data buffers, addressed by the `buffer_index` of non-inlined views.
    pub(super) buffers: Arc<[BufferHandle]>,
    /// Raw bytes of the `BinaryView` records; `len()` is this buffer's byte
    /// length divided by `size_of::<BinaryView>()`.
    pub(super) views: BufferHandle,
}
106
impl Display for VarBinViewData {
    /// Writes nothing to the formatter and always returns `Ok(())`.
    // NOTE(review): presumably the empty output is intentional (nothing worth
    // printing besides the buffers) — confirm against how callers use `Display`.
    fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result {
        Ok(())
    }
}
112
/// Owned components of a `VarBinView` array, as produced by
/// `Array::<VarBinView>::into_data_parts`.
pub struct VarBinViewDataParts {
    /// Logical type of the array (cloned from the array's dtype).
    pub dtype: DType,
    /// Data buffers taken from the array's `VarBinViewData`.
    pub buffers: Arc<[BufferHandle]>,
    /// Views buffer taken from the array's `VarBinViewData`.
    pub views: BufferHandle,
    /// Validity reconstructed from the array's validity slot.
    pub validity: Validity,
}
119
120impl VarBinViewData {
121 fn dtype_parts(dtype: &DType) -> VortexResult<(bool, Nullability)> {
122 match dtype {
123 DType::Utf8(nullability) => Ok((true, *nullability)),
124 DType::Binary(nullability) => Ok((false, *nullability)),
125 _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
126 }
127 }
128
129 pub(super) fn make_slots(validity: &Validity, len: usize) -> ArraySlots {
131 smallvec![validity_to_child(validity, len)]
132 }
133
134 pub fn new(
141 views: Buffer<BinaryView>,
142 buffers: Arc<[ByteBuffer]>,
143 dtype: DType,
144 validity: Validity,
145 ) -> Self {
146 Self::try_new(views, buffers, dtype, validity)
147 .vortex_expect("VarBinViewArray construction failed")
148 }
149
150 pub fn new_handle(
157 views: BufferHandle,
158 buffers: Arc<[BufferHandle]>,
159 dtype: DType,
160 validity: Validity,
161 ) -> Self {
162 Self::try_new_handle(views, buffers, dtype, validity)
163 .vortex_expect("VarbinViewArray construction failed")
164 }
165
166 pub fn try_new(
175 views: Buffer<BinaryView>,
176 buffers: Arc<[ByteBuffer]>,
177 dtype: DType,
178 validity: Validity,
179 ) -> VortexResult<Self> {
180 Self::validate(&views, &buffers, &dtype, &validity)?;
181
182 Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) })
184 }
185
186 pub fn try_new_handle(
195 views: BufferHandle,
196 buffers: Arc<[BufferHandle]>,
197 dtype: DType,
198 validity: Validity,
199 ) -> VortexResult<Self> {
200 let views_nbytes = views.len();
201 vortex_ensure!(
202 views_nbytes.is_multiple_of(size_of::<BinaryView>()),
203 "Expected views buffer length ({views_nbytes}) to be a multiple of {}",
204 size_of::<BinaryView>()
205 );
206
207 if let Some(host) = views.as_host_opt() {
209 vortex_ensure!(
210 host.is_aligned(Alignment::of::<BinaryView>()),
211 "Views on host must be 16 byte aligned"
212 );
213 }
214
215 Ok(unsafe { Self::new_handle_unchecked(views, buffers, dtype, validity) })
217 }
218
219 pub unsafe fn new_unchecked(
249 views: Buffer<BinaryView>,
250 buffers: Arc<[ByteBuffer]>,
251 dtype: DType,
252 validity: Validity,
253 ) -> Self {
254 #[cfg(debug_assertions)]
255 Self::validate(&views, &buffers, &dtype, &validity)
256 .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters");
257
258 let handles: Vec<BufferHandle> = buffers
259 .iter()
260 .cloned()
261 .map(BufferHandle::new_host)
262 .collect();
263
264 let handles = Arc::from(handles);
265 let view_handle = BufferHandle::new_host(views.into_byte_buffer());
266 unsafe { Self::new_handle_unchecked(view_handle, handles, dtype, validity) }
267 }
268
269 pub unsafe fn new_handle_unchecked(
275 views: BufferHandle,
276 buffers: Arc<[BufferHandle]>,
277 dtype: DType,
278 _validity: Validity,
279 ) -> Self {
280 let _ =
281 Self::dtype_parts(&dtype).vortex_expect("VarBinViewArray dtype must be utf8 or binary");
282 Self { buffers, views }
283 }
284
285 pub fn validate(
289 views: &Buffer<BinaryView>,
290 buffers: &Arc<[ByteBuffer]>,
291 dtype: &DType,
292 validity: &Validity,
293 ) -> VortexResult<()> {
294 vortex_ensure!(
295 validity.nullability() == dtype.nullability(),
296 InvalidArgument: "validity {:?} incompatible with nullability {:?}",
297 validity,
298 dtype.nullability()
299 );
300
301 match dtype {
302 DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
303 simdutf8::basic::from_utf8(string).is_ok()
304 })?,
305 DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
306 _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
307 }
308
309 Ok(())
310 }
311
312 fn validate_views<F>(
313 views: &Buffer<BinaryView>,
314 buffers: &Arc<[ByteBuffer]>,
315 validity: &Validity,
316 validator: F,
317 ) -> VortexResult<()>
318 where
319 F: Fn(&[u8]) -> bool,
320 {
321 let validate_view = |idx: usize, view: &BinaryView| -> VortexResult<()> {
322 if view.is_inlined() {
323 let bytes = &view.as_inlined().data[..view.len() as usize];
325 vortex_ensure!(
326 validator(bytes),
327 InvalidArgument: "view at index {idx}: inlined bytes failed utf-8 validation"
328 );
329 } else {
330 let view = view.as_view();
332 let buf_index = view.buffer_index as usize;
333 let start_offset = view.offset as usize;
334 let end_offset = start_offset.saturating_add(view.size as usize);
335
336 let buf = buffers.get(buf_index).ok_or_else(||
337 vortex_err!(InvalidArgument: "view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewData with {} buffers",
338 buffers.len()))?;
339
340 vortex_ensure!(
341 start_offset < buf.len(),
342 InvalidArgument: "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
343 buf.len(),
344 );
345
346 vortex_ensure!(
347 end_offset <= buf.len(),
348 InvalidArgument: "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
349 buf.len(),
350 );
351
352 let bytes = &buf[start_offset..end_offset];
354 vortex_ensure!(
355 view.prefix == bytes[..4],
356 InvalidArgument: "VarBinView prefix does not match full string"
357 );
358
359 vortex_ensure!(
361 validator(bytes),
362 InvalidArgument: "view at index {idx}: outlined bytes fails utf-8 validation"
363 );
364 }
365 Ok(())
366 };
367
368 match validity {
369 Validity::Array(_) => {
373 let mut ctx = LEGACY_SESSION.create_execution_ctx();
374 let mask = validity.execute_mask(views.len(), &mut ctx)?;
375 for ((idx, view), valid) in views.iter().enumerate().zip(mask.iter()) {
376 if valid {
377 validate_view(idx, view)?;
378 }
379 }
380 }
381 Validity::AllInvalid => {}
383 Validity::NonNullable | Validity::AllValid => {
385 for (idx, view) in views.iter().enumerate() {
386 validate_view(idx, view)?;
387 }
388 }
389 }
390
391 Ok(())
392 }
393
394 pub fn len(&self) -> usize {
396 self.views.len() / size_of::<BinaryView>()
397 }
398
399 pub fn is_empty(&self) -> bool {
401 self.len() == 0
402 }
403
404 #[inline]
410 pub fn views(&self) -> &[BinaryView] {
411 let host_views = self.views.as_host();
412 let len = host_views.len() / size_of::<BinaryView>();
413
414 unsafe { std::slice::from_raw_parts(host_views.as_ptr().cast(), len) }
416 }
417
418 pub fn views_handle(&self) -> &BufferHandle {
420 &self.views
421 }
422
423 #[inline]
427 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
428 let views = self.views();
429 let view = &views[index];
430 if !view.is_inlined() {
432 let view_ref = view.as_view();
433 self.buffer(view_ref.buffer_index as usize)
434 .slice(view_ref.as_range())
435 } else {
436 self.views_handle()
438 .as_host()
439 .clone()
440 .into_byte_buffer()
441 .slice_ref(view.as_inlined().value())
442 }
443 }
444
445 #[inline]
452 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
453 if idx >= self.data_buffers().len() {
454 vortex_panic!(
455 "{idx} buffer index out of bounds, there are {} buffers",
456 self.data_buffers().len()
457 );
458 }
459 self.buffers[idx].as_host()
460 }
461
462 #[inline]
464 pub fn data_buffers(&self) -> &Arc<[BufferHandle]> {
465 &self.buffers
466 }
467
468 #[expect(
470 clippy::same_name_method,
471 reason = "intentionally named from_iter like Iterator::from_iter"
472 )]
473 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
474 iter: I,
475 dtype: DType,
476 ) -> Self {
477 let iter = iter.into_iter();
478 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
479
480 for item in iter {
481 match item {
482 None => builder.append_null(),
483 Some(v) => builder.append_value(v),
484 }
485 }
486
487 builder.finish_into_varbinview().into_data()
488 }
489
490 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
491 let iter = iter.into_iter();
492 let mut builder = VarBinViewBuilder::with_capacity(
493 DType::Utf8(Nullability::NonNullable),
494 iter.size_hint().0,
495 );
496
497 for item in iter {
498 builder.append_value(item.as_ref());
499 }
500
501 builder.finish_into_varbinview().into_data()
502 }
503
504 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
505 iter: I,
506 ) -> Self {
507 let iter = iter.into_iter();
508 let mut builder = VarBinViewBuilder::with_capacity(
509 DType::Utf8(Nullability::Nullable),
510 iter.size_hint().0,
511 );
512
513 for item in iter {
514 match item {
515 None => builder.append_null(),
516 Some(v) => builder.append_value(v.as_ref()),
517 }
518 }
519
520 builder.finish_into_varbinview().into_data()
521 }
522
523 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
524 let iter = iter.into_iter();
525 let mut builder = VarBinViewBuilder::with_capacity(
526 DType::Binary(Nullability::NonNullable),
527 iter.size_hint().0,
528 );
529
530 for item in iter {
531 builder.append_value(item.as_ref());
532 }
533
534 builder.finish_into_varbinview().into_data()
535 }
536
537 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
538 iter: I,
539 ) -> Self {
540 let iter = iter.into_iter();
541 let mut builder = VarBinViewBuilder::with_capacity(
542 DType::Binary(Nullability::Nullable),
543 iter.size_hint().0,
544 );
545
546 for item in iter {
547 match item {
548 None => builder.append_null(),
549 Some(v) => builder.append_value(v.as_ref()),
550 }
551 }
552
553 builder.finish_into_varbinview().into_data()
554 }
555}
556
557pub trait VarBinViewArrayExt: TypedArrayRef<VarBinView> {
558 fn dtype_parts(&self) -> (bool, Nullability) {
559 match self.as_ref().dtype() {
560 DType::Utf8(nullability) => (true, *nullability),
561 DType::Binary(nullability) => (false, *nullability),
562 _ => unreachable!("VarBinViewArrayExt requires a utf8 or binary dtype"),
563 }
564 }
565
566 fn varbinview_validity(&self) -> Validity {
567 child_to_validity(
568 self.as_ref().slots()[VALIDITY_SLOT].as_ref(),
569 self.dtype_parts().1,
570 )
571 }
572}
573impl<T: TypedArrayRef<VarBinView>> VarBinViewArrayExt for T {}
574
575impl Array<VarBinView> {
576 #[inline]
577 fn from_prevalidated_data(dtype: DType, data: VarBinViewData, slots: ArraySlots) -> Self {
578 let len = data.len();
579 unsafe {
580 Array::from_parts_unchecked(
581 ArrayParts::new(VarBinView, dtype, len, data).with_slots(slots),
582 )
583 }
584 }
585
586 #[expect(
588 clippy::same_name_method,
589 reason = "intentionally named from_iter like Iterator::from_iter"
590 )]
591 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
592 iter: I,
593 dtype: DType,
594 ) -> Self {
595 let iter = iter.into_iter();
596 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
597 for value in iter {
598 match value {
599 Some(value) => builder.append_value(value),
600 None => builder.append_null(),
601 }
602 }
603 builder.finish_into_varbinview()
604 }
605
606 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
607 let iter = iter.into_iter();
608 let mut builder = VarBinViewBuilder::with_capacity(
609 DType::Utf8(Nullability::NonNullable),
610 iter.size_hint().0,
611 );
612 for value in iter {
613 builder.append_value(value.as_ref());
614 }
615 builder.finish_into_varbinview()
616 }
617
618 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
619 iter: I,
620 ) -> Self {
621 let iter = iter.into_iter();
622 let mut builder = VarBinViewBuilder::with_capacity(
623 DType::Utf8(Nullability::Nullable),
624 iter.size_hint().0,
625 );
626 for value in iter {
627 match value {
628 Some(value) => builder.append_value(value.as_ref()),
629 None => builder.append_null(),
630 }
631 }
632 builder.finish_into_varbinview()
633 }
634
635 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
636 let iter = iter.into_iter();
637 let mut builder = VarBinViewBuilder::with_capacity(
638 DType::Binary(Nullability::NonNullable),
639 iter.size_hint().0,
640 );
641 for value in iter {
642 builder.append_value(value.as_ref());
643 }
644 builder.finish_into_varbinview()
645 }
646
647 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
648 iter: I,
649 ) -> Self {
650 let iter = iter.into_iter();
651 let mut builder = VarBinViewBuilder::with_capacity(
652 DType::Binary(Nullability::Nullable),
653 iter.size_hint().0,
654 );
655 for value in iter {
656 match value {
657 Some(value) => builder.append_value(value.as_ref()),
658 None => builder.append_null(),
659 }
660 }
661 builder.finish_into_varbinview()
662 }
663
664 pub fn try_new(
666 views: Buffer<BinaryView>,
667 buffers: Arc<[ByteBuffer]>,
668 dtype: DType,
669 validity: Validity,
670 ) -> VortexResult<Self> {
671 let data = VarBinViewData::try_new(views, buffers, dtype.clone(), validity.clone())?;
672 let slots = VarBinViewData::make_slots(&validity, data.len());
673 Ok(Self::from_prevalidated_data(dtype, data, slots))
674 }
675
676 pub unsafe fn new_unchecked(
682 views: Buffer<BinaryView>,
683 buffers: Arc<[ByteBuffer]>,
684 dtype: DType,
685 validity: Validity,
686 ) -> Self {
687 let data = unsafe {
688 VarBinViewData::new_unchecked(views, buffers, dtype.clone(), validity.clone())
689 };
690 let slots = VarBinViewData::make_slots(&validity, data.len());
691 Self::from_prevalidated_data(dtype, data, slots)
692 }
693
694 pub fn new_handle(
696 views: BufferHandle,
697 buffers: Arc<[BufferHandle]>,
698 dtype: DType,
699 validity: Validity,
700 ) -> Self {
701 let data = VarBinViewData::new_handle(views, buffers, dtype.clone(), validity.clone());
702 let slots = VarBinViewData::make_slots(&validity, data.len());
703 Self::from_prevalidated_data(dtype, data, slots)
704 }
705
706 pub unsafe fn new_handle_unchecked(
712 views: BufferHandle,
713 buffers: Arc<[BufferHandle]>,
714 dtype: DType,
715 validity: Validity,
716 ) -> Self {
717 let data = unsafe {
718 VarBinViewData::new_handle_unchecked(views, buffers, dtype.clone(), validity.clone())
719 };
720 let slots = VarBinViewData::make_slots(&validity, data.len());
721 Self::from_prevalidated_data(dtype, data, slots)
722 }
723
724 pub fn into_data_parts(self) -> VarBinViewDataParts {
725 let dtype = self.dtype().clone();
726 let validity = self.varbinview_validity();
727 let data = self.into_data();
728 VarBinViewDataParts {
729 dtype,
730 buffers: data.buffers,
731 views: data.views,
732 validity,
733 }
734 }
735}
736
737impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewData {
738 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
739 Self::from_iter_nullable_bin(iter)
740 }
741}
742
743impl FromIterator<Option<Vec<u8>>> for VarBinViewData {
744 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
745 Self::from_iter_nullable_bin(iter)
746 }
747}
748
749impl FromIterator<Option<String>> for VarBinViewData {
750 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
751 Self::from_iter_nullable_str(iter)
752 }
753}
754
755impl<'a> FromIterator<Option<&'a str>> for VarBinViewData {
756 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
757 Self::from_iter_nullable_str(iter)
758 }
759}
760
761impl<'a> FromIterator<Option<&'a [u8]>> for Array<VarBinView> {
764 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
765 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
766 }
767}
768
769impl FromIterator<Option<Vec<u8>>> for Array<VarBinView> {
770 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
771 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
772 }
773}
774
775impl FromIterator<Option<String>> for Array<VarBinView> {
776 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
777 Self::from_iter_nullable_str(iter)
778 }
779}
780
781impl<'a> FromIterator<Option<&'a str>> for Array<VarBinView> {
782 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
783 Self::from_iter_nullable_str(iter)
784 }
785}