1use std::fmt::Display;
5use std::fmt::Formatter;
6use std::mem::size_of;
7use std::sync::Arc;
8
9use smallvec::smallvec;
10use vortex_buffer::Alignment;
11use vortex_buffer::Buffer;
12use vortex_buffer::ByteBuffer;
13use vortex_error::VortexExpect;
14use vortex_error::VortexResult;
15use vortex_error::vortex_bail;
16use vortex_error::vortex_ensure;
17use vortex_error::vortex_err;
18use vortex_error::vortex_panic;
19
20use crate::ArraySlots;
21use crate::array::Array;
22use crate::array::ArrayParts;
23use crate::array::TypedArrayRef;
24use crate::array::child_to_validity;
25use crate::array::validity_to_child;
26use crate::arrays::VarBinView;
27use crate::arrays::varbinview::BinaryView;
28use crate::buffer::BufferHandle;
29use crate::builders::ArrayBuilder;
30use crate::builders::VarBinViewBuilder;
31use crate::dtype::DType;
32use crate::dtype::Nullability;
33use crate::validity::Validity;
34
35pub(super) const VALIDITY_SLOT: usize = 0;
37pub(super) const NUM_SLOTS: usize = 1;
38pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["validity"];
39
40#[derive(Clone, Debug)]
100pub struct VarBinViewData {
101 pub(super) buffers: Arc<[BufferHandle]>,
102 pub(super) views: BufferHandle,
103}
104
105impl Display for VarBinViewData {
106 fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result {
107 Ok(())
108 }
109}
110
111pub struct VarBinViewDataParts {
112 pub dtype: DType,
113 pub buffers: Arc<[BufferHandle]>,
114 pub views: BufferHandle,
115 pub validity: Validity,
116}
117
118impl VarBinViewData {
119 fn dtype_parts(dtype: &DType) -> VortexResult<(bool, Nullability)> {
120 match dtype {
121 DType::Utf8(nullability) => Ok((true, *nullability)),
122 DType::Binary(nullability) => Ok((false, *nullability)),
123 _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
124 }
125 }
126
127 pub(super) fn make_slots(validity: &Validity, len: usize) -> ArraySlots {
129 smallvec![validity_to_child(validity, len)]
130 }
131
132 pub fn new(
139 views: Buffer<BinaryView>,
140 buffers: Arc<[ByteBuffer]>,
141 dtype: DType,
142 validity: Validity,
143 ) -> Self {
144 Self::try_new(views, buffers, dtype, validity)
145 .vortex_expect("VarBinViewArray construction failed")
146 }
147
148 pub fn new_handle(
155 views: BufferHandle,
156 buffers: Arc<[BufferHandle]>,
157 dtype: DType,
158 validity: Validity,
159 ) -> Self {
160 Self::try_new_handle(views, buffers, dtype, validity)
161 .vortex_expect("VarbinViewArray construction failed")
162 }
163
164 pub fn try_new(
173 views: Buffer<BinaryView>,
174 buffers: Arc<[ByteBuffer]>,
175 dtype: DType,
176 validity: Validity,
177 ) -> VortexResult<Self> {
178 Self::validate(&views, &buffers, &dtype, &validity)?;
179
180 Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) })
182 }
183
184 pub fn try_new_handle(
193 views: BufferHandle,
194 buffers: Arc<[BufferHandle]>,
195 dtype: DType,
196 validity: Validity,
197 ) -> VortexResult<Self> {
198 let views_nbytes = views.len();
199 vortex_ensure!(
200 views_nbytes.is_multiple_of(size_of::<BinaryView>()),
201 "Expected views buffer length ({views_nbytes}) to be a multiple of {}",
202 size_of::<BinaryView>()
203 );
204
205 if let Some(host) = views.as_host_opt() {
207 vortex_ensure!(
208 host.is_aligned(Alignment::of::<BinaryView>()),
209 "Views on host must be 16 byte aligned"
210 );
211 }
212
213 Ok(unsafe { Self::new_handle_unchecked(views, buffers, dtype, validity) })
215 }
216
217 pub unsafe fn new_unchecked(
247 views: Buffer<BinaryView>,
248 buffers: Arc<[ByteBuffer]>,
249 dtype: DType,
250 validity: Validity,
251 ) -> Self {
252 #[cfg(debug_assertions)]
253 Self::validate(&views, &buffers, &dtype, &validity)
254 .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters");
255
256 let handles: Vec<BufferHandle> = buffers
257 .iter()
258 .cloned()
259 .map(BufferHandle::new_host)
260 .collect();
261
262 let handles = Arc::from(handles);
263 let view_handle = BufferHandle::new_host(views.into_byte_buffer());
264 unsafe { Self::new_handle_unchecked(view_handle, handles, dtype, validity) }
265 }
266
267 pub unsafe fn new_handle_unchecked(
273 views: BufferHandle,
274 buffers: Arc<[BufferHandle]>,
275 dtype: DType,
276 _validity: Validity,
277 ) -> Self {
278 let _ =
279 Self::dtype_parts(&dtype).vortex_expect("VarBinViewArray dtype must be utf8 or binary");
280 Self { buffers, views }
281 }
282
283 pub fn validate(
287 views: &Buffer<BinaryView>,
288 buffers: &Arc<[ByteBuffer]>,
289 dtype: &DType,
290 validity: &Validity,
291 ) -> VortexResult<()> {
292 vortex_ensure!(
293 validity.nullability() == dtype.nullability(),
294 InvalidArgument: "validity {:?} incompatible with nullability {:?}",
295 validity,
296 dtype.nullability()
297 );
298
299 match dtype {
300 DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
301 simdutf8::basic::from_utf8(string).is_ok()
302 })?,
303 DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
304 _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
305 }
306
307 Ok(())
308 }
309
310 fn validate_views<F>(
311 views: &Buffer<BinaryView>,
312 buffers: &Arc<[ByteBuffer]>,
313 validity: &Validity,
314 validator: F,
315 ) -> VortexResult<()>
316 where
317 F: Fn(&[u8]) -> bool,
318 {
319 for (idx, &view) in views.iter().enumerate() {
320 if validity.is_null(idx)? {
321 continue;
322 }
323
324 if view.is_inlined() {
325 let bytes = &view.as_inlined().data[..view.len() as usize];
327 vortex_ensure!(
328 validator(bytes),
329 InvalidArgument: "view at index {idx}: inlined bytes failed utf-8 validation"
330 );
331 } else {
332 let view = view.as_view();
334 let buf_index = view.buffer_index as usize;
335 let start_offset = view.offset as usize;
336 let end_offset = start_offset.saturating_add(view.size as usize);
337
338 let buf = buffers.get(buf_index).ok_or_else(||
339 vortex_err!(InvalidArgument: "view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewData with {} buffers",
340 buffers.len()))?;
341
342 vortex_ensure!(
343 start_offset < buf.len(),
344 InvalidArgument: "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
345 buf.len(),
346 );
347
348 vortex_ensure!(
349 end_offset <= buf.len(),
350 InvalidArgument: "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
351 buf.len(),
352 );
353
354 let bytes = &buf[start_offset..end_offset];
356 vortex_ensure!(
357 view.prefix == bytes[..4],
358 InvalidArgument: "VarBinView prefix does not match full string"
359 );
360
361 vortex_ensure!(
363 validator(bytes),
364 InvalidArgument: "view at index {idx}: outlined bytes fails utf-8 validation"
365 );
366 }
367 }
368
369 Ok(())
370 }
371
372 pub fn len(&self) -> usize {
374 self.views.len() / size_of::<BinaryView>()
375 }
376
377 pub fn is_empty(&self) -> bool {
379 self.len() == 0
380 }
381
382 #[inline]
388 pub fn views(&self) -> &[BinaryView] {
389 let host_views = self.views.as_host();
390 let len = host_views.len() / size_of::<BinaryView>();
391
392 unsafe { std::slice::from_raw_parts(host_views.as_ptr().cast(), len) }
394 }
395
396 pub fn views_handle(&self) -> &BufferHandle {
398 &self.views
399 }
400
401 #[inline]
405 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
406 let views = self.views();
407 let view = &views[index];
408 if !view.is_inlined() {
410 let view_ref = view.as_view();
411 self.buffer(view_ref.buffer_index as usize)
412 .slice(view_ref.as_range())
413 } else {
414 self.views_handle()
416 .as_host()
417 .clone()
418 .into_byte_buffer()
419 .slice_ref(view.as_inlined().value())
420 }
421 }
422
423 #[inline]
430 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
431 if idx >= self.data_buffers().len() {
432 vortex_panic!(
433 "{idx} buffer index out of bounds, there are {} buffers",
434 self.data_buffers().len()
435 );
436 }
437 self.buffers[idx].as_host()
438 }
439
440 #[inline]
442 pub fn data_buffers(&self) -> &Arc<[BufferHandle]> {
443 &self.buffers
444 }
445
446 #[expect(
448 clippy::same_name_method,
449 reason = "intentionally named from_iter like Iterator::from_iter"
450 )]
451 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
452 iter: I,
453 dtype: DType,
454 ) -> Self {
455 let iter = iter.into_iter();
456 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
457
458 for item in iter {
459 match item {
460 None => builder.append_null(),
461 Some(v) => builder.append_value(v),
462 }
463 }
464
465 builder.finish_into_varbinview().into_data()
466 }
467
468 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
469 let iter = iter.into_iter();
470 let mut builder = VarBinViewBuilder::with_capacity(
471 DType::Utf8(Nullability::NonNullable),
472 iter.size_hint().0,
473 );
474
475 for item in iter {
476 builder.append_value(item.as_ref());
477 }
478
479 builder.finish_into_varbinview().into_data()
480 }
481
482 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
483 iter: I,
484 ) -> Self {
485 let iter = iter.into_iter();
486 let mut builder = VarBinViewBuilder::with_capacity(
487 DType::Utf8(Nullability::Nullable),
488 iter.size_hint().0,
489 );
490
491 for item in iter {
492 match item {
493 None => builder.append_null(),
494 Some(v) => builder.append_value(v.as_ref()),
495 }
496 }
497
498 builder.finish_into_varbinview().into_data()
499 }
500
501 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
502 let iter = iter.into_iter();
503 let mut builder = VarBinViewBuilder::with_capacity(
504 DType::Binary(Nullability::NonNullable),
505 iter.size_hint().0,
506 );
507
508 for item in iter {
509 builder.append_value(item.as_ref());
510 }
511
512 builder.finish_into_varbinview().into_data()
513 }
514
515 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
516 iter: I,
517 ) -> Self {
518 let iter = iter.into_iter();
519 let mut builder = VarBinViewBuilder::with_capacity(
520 DType::Binary(Nullability::Nullable),
521 iter.size_hint().0,
522 );
523
524 for item in iter {
525 match item {
526 None => builder.append_null(),
527 Some(v) => builder.append_value(v.as_ref()),
528 }
529 }
530
531 builder.finish_into_varbinview().into_data()
532 }
533}
534
535pub trait VarBinViewArrayExt: TypedArrayRef<VarBinView> {
536 fn dtype_parts(&self) -> (bool, Nullability) {
537 match self.as_ref().dtype() {
538 DType::Utf8(nullability) => (true, *nullability),
539 DType::Binary(nullability) => (false, *nullability),
540 _ => unreachable!("VarBinViewArrayExt requires a utf8 or binary dtype"),
541 }
542 }
543
544 fn varbinview_validity(&self) -> Validity {
545 child_to_validity(
546 self.as_ref().slots()[VALIDITY_SLOT].as_ref(),
547 self.dtype_parts().1,
548 )
549 }
550}
551impl<T: TypedArrayRef<VarBinView>> VarBinViewArrayExt for T {}
552
553impl Array<VarBinView> {
554 #[inline]
555 fn from_prevalidated_data(dtype: DType, data: VarBinViewData, slots: ArraySlots) -> Self {
556 let len = data.len();
557 unsafe {
558 Array::from_parts_unchecked(
559 ArrayParts::new(VarBinView, dtype, len, data).with_slots(slots),
560 )
561 }
562 }
563
564 #[expect(
566 clippy::same_name_method,
567 reason = "intentionally named from_iter like Iterator::from_iter"
568 )]
569 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
570 iter: I,
571 dtype: DType,
572 ) -> Self {
573 let iter = iter.into_iter();
574 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
575 for value in iter {
576 match value {
577 Some(value) => builder.append_value(value),
578 None => builder.append_null(),
579 }
580 }
581 builder.finish_into_varbinview()
582 }
583
584 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
585 let iter = iter.into_iter();
586 let mut builder = VarBinViewBuilder::with_capacity(
587 DType::Utf8(Nullability::NonNullable),
588 iter.size_hint().0,
589 );
590 for value in iter {
591 builder.append_value(value.as_ref());
592 }
593 builder.finish_into_varbinview()
594 }
595
596 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
597 iter: I,
598 ) -> Self {
599 let iter = iter.into_iter();
600 let mut builder = VarBinViewBuilder::with_capacity(
601 DType::Utf8(Nullability::Nullable),
602 iter.size_hint().0,
603 );
604 for value in iter {
605 match value {
606 Some(value) => builder.append_value(value.as_ref()),
607 None => builder.append_null(),
608 }
609 }
610 builder.finish_into_varbinview()
611 }
612
613 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
614 let iter = iter.into_iter();
615 let mut builder = VarBinViewBuilder::with_capacity(
616 DType::Binary(Nullability::NonNullable),
617 iter.size_hint().0,
618 );
619 for value in iter {
620 builder.append_value(value.as_ref());
621 }
622 builder.finish_into_varbinview()
623 }
624
625 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
626 iter: I,
627 ) -> Self {
628 let iter = iter.into_iter();
629 let mut builder = VarBinViewBuilder::with_capacity(
630 DType::Binary(Nullability::Nullable),
631 iter.size_hint().0,
632 );
633 for value in iter {
634 match value {
635 Some(value) => builder.append_value(value.as_ref()),
636 None => builder.append_null(),
637 }
638 }
639 builder.finish_into_varbinview()
640 }
641
642 pub fn try_new(
644 views: Buffer<BinaryView>,
645 buffers: Arc<[ByteBuffer]>,
646 dtype: DType,
647 validity: Validity,
648 ) -> VortexResult<Self> {
649 let data = VarBinViewData::try_new(views, buffers, dtype.clone(), validity.clone())?;
650 let slots = VarBinViewData::make_slots(&validity, data.len());
651 Ok(Self::from_prevalidated_data(dtype, data, slots))
652 }
653
654 pub unsafe fn new_unchecked(
660 views: Buffer<BinaryView>,
661 buffers: Arc<[ByteBuffer]>,
662 dtype: DType,
663 validity: Validity,
664 ) -> Self {
665 let data = unsafe {
666 VarBinViewData::new_unchecked(views, buffers, dtype.clone(), validity.clone())
667 };
668 let slots = VarBinViewData::make_slots(&validity, data.len());
669 Self::from_prevalidated_data(dtype, data, slots)
670 }
671
672 pub fn new_handle(
674 views: BufferHandle,
675 buffers: Arc<[BufferHandle]>,
676 dtype: DType,
677 validity: Validity,
678 ) -> Self {
679 let data = VarBinViewData::new_handle(views, buffers, dtype.clone(), validity.clone());
680 let slots = VarBinViewData::make_slots(&validity, data.len());
681 Self::from_prevalidated_data(dtype, data, slots)
682 }
683
684 pub unsafe fn new_handle_unchecked(
690 views: BufferHandle,
691 buffers: Arc<[BufferHandle]>,
692 dtype: DType,
693 validity: Validity,
694 ) -> Self {
695 let data = unsafe {
696 VarBinViewData::new_handle_unchecked(views, buffers, dtype.clone(), validity.clone())
697 };
698 let slots = VarBinViewData::make_slots(&validity, data.len());
699 Self::from_prevalidated_data(dtype, data, slots)
700 }
701
702 pub fn into_data_parts(self) -> VarBinViewDataParts {
703 let dtype = self.dtype().clone();
704 let validity = self.varbinview_validity();
705 let data = self.into_data();
706 VarBinViewDataParts {
707 dtype,
708 buffers: data.buffers,
709 views: data.views,
710 validity,
711 }
712 }
713}
714
715impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewData {
716 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
717 Self::from_iter_nullable_bin(iter)
718 }
719}
720
721impl FromIterator<Option<Vec<u8>>> for VarBinViewData {
722 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
723 Self::from_iter_nullable_bin(iter)
724 }
725}
726
727impl FromIterator<Option<String>> for VarBinViewData {
728 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
729 Self::from_iter_nullable_str(iter)
730 }
731}
732
733impl<'a> FromIterator<Option<&'a str>> for VarBinViewData {
734 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
735 Self::from_iter_nullable_str(iter)
736 }
737}
738
739impl<'a> FromIterator<Option<&'a [u8]>> for Array<VarBinView> {
742 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
743 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
744 }
745}
746
747impl FromIterator<Option<Vec<u8>>> for Array<VarBinView> {
748 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
749 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
750 }
751}
752
753impl FromIterator<Option<String>> for Array<VarBinView> {
754 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
755 Self::from_iter_nullable_str(iter)
756 }
757}
758
759impl<'a> FromIterator<Option<&'a str>> for Array<VarBinView> {
760 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
761 Self::from_iter_nullable_str(iter)
762 }
763}