1use std::fmt::Display;
5use std::fmt::Formatter;
6use std::mem::size_of;
7use std::sync::Arc;
8
9use vortex_buffer::Alignment;
10use vortex_buffer::Buffer;
11use vortex_buffer::ByteBuffer;
12use vortex_error::VortexExpect;
13use vortex_error::VortexResult;
14use vortex_error::vortex_bail;
15use vortex_error::vortex_ensure;
16use vortex_error::vortex_err;
17use vortex_error::vortex_panic;
18
19use crate::ArrayRef;
20use crate::array::Array;
21use crate::array::ArrayParts;
22use crate::array::TypedArrayRef;
23use crate::array::child_to_validity;
24use crate::array::validity_to_child;
25use crate::arrays::VarBinView;
26use crate::arrays::varbinview::BinaryView;
27use crate::buffer::BufferHandle;
28use crate::builders::ArrayBuilder;
29use crate::builders::VarBinViewBuilder;
30use crate::dtype::DType;
31use crate::dtype::Nullability;
32use crate::validity::Validity;
33
34pub(super) const VALIDITY_SLOT: usize = 0;
36pub(super) const NUM_SLOTS: usize = 1;
37pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["validity"];
38
39#[derive(Clone, Debug)]
99pub struct VarBinViewData {
100 pub(super) buffers: Arc<[BufferHandle]>,
101 pub(super) views: BufferHandle,
102}
103
104impl Display for VarBinViewData {
105 fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result {
106 Ok(())
107 }
108}
109
110pub struct VarBinViewDataParts {
111 pub dtype: DType,
112 pub buffers: Arc<[BufferHandle]>,
113 pub views: BufferHandle,
114 pub validity: Validity,
115}
116
117impl VarBinViewData {
118 fn dtype_parts(dtype: &DType) -> VortexResult<(bool, Nullability)> {
119 match dtype {
120 DType::Utf8(nullability) => Ok((true, *nullability)),
121 DType::Binary(nullability) => Ok((false, *nullability)),
122 _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
123 }
124 }
125
126 pub(super) fn make_slots(validity: &Validity, len: usize) -> Vec<Option<ArrayRef>> {
128 vec![validity_to_child(validity, len)]
129 }
130
131 pub fn new(
138 views: Buffer<BinaryView>,
139 buffers: Arc<[ByteBuffer]>,
140 dtype: DType,
141 validity: Validity,
142 ) -> Self {
143 Self::try_new(views, buffers, dtype, validity)
144 .vortex_expect("VarBinViewArray construction failed")
145 }
146
147 pub fn new_handle(
154 views: BufferHandle,
155 buffers: Arc<[BufferHandle]>,
156 dtype: DType,
157 validity: Validity,
158 ) -> Self {
159 Self::try_new_handle(views, buffers, dtype, validity)
160 .vortex_expect("VarbinViewArray construction failed")
161 }
162
163 pub fn try_new(
172 views: Buffer<BinaryView>,
173 buffers: Arc<[ByteBuffer]>,
174 dtype: DType,
175 validity: Validity,
176 ) -> VortexResult<Self> {
177 Self::validate(&views, &buffers, &dtype, &validity)?;
178
179 Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) })
181 }
182
183 pub fn try_new_handle(
192 views: BufferHandle,
193 buffers: Arc<[BufferHandle]>,
194 dtype: DType,
195 validity: Validity,
196 ) -> VortexResult<Self> {
197 let views_nbytes = views.len();
198 vortex_ensure!(
199 views_nbytes.is_multiple_of(size_of::<BinaryView>()),
200 "Expected views buffer length ({views_nbytes}) to be a multiple of {}",
201 size_of::<BinaryView>()
202 );
203
204 if let Some(host) = views.as_host_opt() {
206 vortex_ensure!(
207 host.is_aligned(Alignment::of::<BinaryView>()),
208 "Views on host must be 16 byte aligned"
209 );
210 }
211
212 Ok(unsafe { Self::new_handle_unchecked(views, buffers, dtype, validity) })
214 }
215
216 pub unsafe fn new_unchecked(
246 views: Buffer<BinaryView>,
247 buffers: Arc<[ByteBuffer]>,
248 dtype: DType,
249 validity: Validity,
250 ) -> Self {
251 #[cfg(debug_assertions)]
252 Self::validate(&views, &buffers, &dtype, &validity)
253 .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters");
254
255 let handles: Vec<BufferHandle> = buffers
256 .iter()
257 .cloned()
258 .map(BufferHandle::new_host)
259 .collect();
260
261 let handles = Arc::from(handles);
262 let view_handle = BufferHandle::new_host(views.into_byte_buffer());
263 unsafe { Self::new_handle_unchecked(view_handle, handles, dtype, validity) }
264 }
265
266 pub unsafe fn new_handle_unchecked(
272 views: BufferHandle,
273 buffers: Arc<[BufferHandle]>,
274 dtype: DType,
275 _validity: Validity,
276 ) -> Self {
277 let _ =
278 Self::dtype_parts(&dtype).vortex_expect("VarBinViewArray dtype must be utf8 or binary");
279 Self { buffers, views }
280 }
281
282 pub fn validate(
286 views: &Buffer<BinaryView>,
287 buffers: &Arc<[ByteBuffer]>,
288 dtype: &DType,
289 validity: &Validity,
290 ) -> VortexResult<()> {
291 vortex_ensure!(
292 validity.nullability() == dtype.nullability(),
293 InvalidArgument: "validity {:?} incompatible with nullability {:?}",
294 validity,
295 dtype.nullability()
296 );
297
298 match dtype {
299 DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
300 simdutf8::basic::from_utf8(string).is_ok()
301 })?,
302 DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
303 _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
304 }
305
306 Ok(())
307 }
308
309 fn validate_views<F>(
310 views: &Buffer<BinaryView>,
311 buffers: &Arc<[ByteBuffer]>,
312 validity: &Validity,
313 validator: F,
314 ) -> VortexResult<()>
315 where
316 F: Fn(&[u8]) -> bool,
317 {
318 for (idx, &view) in views.iter().enumerate() {
319 if validity.is_null(idx)? {
320 continue;
321 }
322
323 if view.is_inlined() {
324 let bytes = &view.as_inlined().data[..view.len() as usize];
326 vortex_ensure!(
327 validator(bytes),
328 InvalidArgument: "view at index {idx}: inlined bytes failed utf-8 validation"
329 );
330 } else {
331 let view = view.as_view();
333 let buf_index = view.buffer_index as usize;
334 let start_offset = view.offset as usize;
335 let end_offset = start_offset.saturating_add(view.size as usize);
336
337 let buf = buffers.get(buf_index).ok_or_else(||
338 vortex_err!(InvalidArgument: "view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewData with {} buffers",
339 buffers.len()))?;
340
341 vortex_ensure!(
342 start_offset < buf.len(),
343 InvalidArgument: "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
344 buf.len(),
345 );
346
347 vortex_ensure!(
348 end_offset <= buf.len(),
349 InvalidArgument: "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
350 buf.len(),
351 );
352
353 let bytes = &buf[start_offset..end_offset];
355 vortex_ensure!(
356 view.prefix == bytes[..4],
357 InvalidArgument: "VarBinView prefix does not match full string"
358 );
359
360 vortex_ensure!(
362 validator(bytes),
363 InvalidArgument: "view at index {idx}: outlined bytes fails utf-8 validation"
364 );
365 }
366 }
367
368 Ok(())
369 }
370
371 pub fn len(&self) -> usize {
373 self.views.len() / size_of::<BinaryView>()
374 }
375
376 pub fn is_empty(&self) -> bool {
378 self.len() == 0
379 }
380
381 #[inline]
387 pub fn views(&self) -> &[BinaryView] {
388 let host_views = self.views.as_host();
389 let len = host_views.len() / size_of::<BinaryView>();
390
391 unsafe { std::slice::from_raw_parts(host_views.as_ptr().cast(), len) }
393 }
394
395 pub fn views_handle(&self) -> &BufferHandle {
397 &self.views
398 }
399
400 #[inline]
404 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
405 let views = self.views();
406 let view = &views[index];
407 if !view.is_inlined() {
409 let view_ref = view.as_view();
410 self.buffer(view_ref.buffer_index as usize)
411 .slice(view_ref.as_range())
412 } else {
413 self.views_handle()
415 .as_host()
416 .clone()
417 .into_byte_buffer()
418 .slice_ref(view.as_inlined().value())
419 }
420 }
421
422 #[inline]
429 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
430 if idx >= self.data_buffers().len() {
431 vortex_panic!(
432 "{idx} buffer index out of bounds, there are {} buffers",
433 self.data_buffers().len()
434 );
435 }
436 self.buffers[idx].as_host()
437 }
438
439 #[inline]
441 pub fn data_buffers(&self) -> &Arc<[BufferHandle]> {
442 &self.buffers
443 }
444
445 #[expect(
447 clippy::same_name_method,
448 reason = "intentionally named from_iter like Iterator::from_iter"
449 )]
450 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
451 iter: I,
452 dtype: DType,
453 ) -> Self {
454 let iter = iter.into_iter();
455 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
456
457 for item in iter {
458 match item {
459 None => builder.append_null(),
460 Some(v) => builder.append_value(v),
461 }
462 }
463
464 builder.finish_into_varbinview().into_data()
465 }
466
467 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
468 let iter = iter.into_iter();
469 let mut builder = VarBinViewBuilder::with_capacity(
470 DType::Utf8(Nullability::NonNullable),
471 iter.size_hint().0,
472 );
473
474 for item in iter {
475 builder.append_value(item.as_ref());
476 }
477
478 builder.finish_into_varbinview().into_data()
479 }
480
481 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
482 iter: I,
483 ) -> Self {
484 let iter = iter.into_iter();
485 let mut builder = VarBinViewBuilder::with_capacity(
486 DType::Utf8(Nullability::Nullable),
487 iter.size_hint().0,
488 );
489
490 for item in iter {
491 match item {
492 None => builder.append_null(),
493 Some(v) => builder.append_value(v.as_ref()),
494 }
495 }
496
497 builder.finish_into_varbinview().into_data()
498 }
499
500 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
501 let iter = iter.into_iter();
502 let mut builder = VarBinViewBuilder::with_capacity(
503 DType::Binary(Nullability::NonNullable),
504 iter.size_hint().0,
505 );
506
507 for item in iter {
508 builder.append_value(item.as_ref());
509 }
510
511 builder.finish_into_varbinview().into_data()
512 }
513
514 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
515 iter: I,
516 ) -> Self {
517 let iter = iter.into_iter();
518 let mut builder = VarBinViewBuilder::with_capacity(
519 DType::Binary(Nullability::Nullable),
520 iter.size_hint().0,
521 );
522
523 for item in iter {
524 match item {
525 None => builder.append_null(),
526 Some(v) => builder.append_value(v.as_ref()),
527 }
528 }
529
530 builder.finish_into_varbinview().into_data()
531 }
532}
533
534pub trait VarBinViewArrayExt: TypedArrayRef<VarBinView> {
535 fn dtype_parts(&self) -> (bool, Nullability) {
536 match self.as_ref().dtype() {
537 DType::Utf8(nullability) => (true, *nullability),
538 DType::Binary(nullability) => (false, *nullability),
539 _ => unreachable!("VarBinViewArrayExt requires a utf8 or binary dtype"),
540 }
541 }
542
543 fn varbinview_validity(&self) -> Validity {
544 child_to_validity(&self.as_ref().slots()[VALIDITY_SLOT], self.dtype_parts().1)
545 }
546}
547impl<T: TypedArrayRef<VarBinView>> VarBinViewArrayExt for T {}
548
549impl Array<VarBinView> {
550 #[inline]
551 fn from_prevalidated_data(
552 dtype: DType,
553 data: VarBinViewData,
554 slots: Vec<Option<ArrayRef>>,
555 ) -> Self {
556 let len = data.len();
557 unsafe {
558 Array::from_parts_unchecked(
559 ArrayParts::new(VarBinView, dtype, len, data).with_slots(slots),
560 )
561 }
562 }
563
564 #[expect(
566 clippy::same_name_method,
567 reason = "intentionally named from_iter like Iterator::from_iter"
568 )]
569 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
570 iter: I,
571 dtype: DType,
572 ) -> Self {
573 let iter = iter.into_iter();
574 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
575 for value in iter {
576 match value {
577 Some(value) => builder.append_value(value),
578 None => builder.append_null(),
579 }
580 }
581 builder.finish_into_varbinview()
582 }
583
584 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
585 let iter = iter.into_iter();
586 let mut builder = VarBinViewBuilder::with_capacity(
587 DType::Utf8(Nullability::NonNullable),
588 iter.size_hint().0,
589 );
590 for value in iter {
591 builder.append_value(value.as_ref());
592 }
593 builder.finish_into_varbinview()
594 }
595
596 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
597 iter: I,
598 ) -> Self {
599 let iter = iter.into_iter();
600 let mut builder = VarBinViewBuilder::with_capacity(
601 DType::Utf8(Nullability::Nullable),
602 iter.size_hint().0,
603 );
604 for value in iter {
605 match value {
606 Some(value) => builder.append_value(value.as_ref()),
607 None => builder.append_null(),
608 }
609 }
610 builder.finish_into_varbinview()
611 }
612
613 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
614 let iter = iter.into_iter();
615 let mut builder = VarBinViewBuilder::with_capacity(
616 DType::Binary(Nullability::NonNullable),
617 iter.size_hint().0,
618 );
619 for value in iter {
620 builder.append_value(value.as_ref());
621 }
622 builder.finish_into_varbinview()
623 }
624
625 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
626 iter: I,
627 ) -> Self {
628 let iter = iter.into_iter();
629 let mut builder = VarBinViewBuilder::with_capacity(
630 DType::Binary(Nullability::Nullable),
631 iter.size_hint().0,
632 );
633 for value in iter {
634 match value {
635 Some(value) => builder.append_value(value.as_ref()),
636 None => builder.append_null(),
637 }
638 }
639 builder.finish_into_varbinview()
640 }
641
642 pub fn try_new(
644 views: Buffer<BinaryView>,
645 buffers: Arc<[ByteBuffer]>,
646 dtype: DType,
647 validity: Validity,
648 ) -> VortexResult<Self> {
649 let data = VarBinViewData::try_new(views, buffers, dtype.clone(), validity.clone())?;
650 let slots = VarBinViewData::make_slots(&validity, data.len());
651 Ok(Self::from_prevalidated_data(dtype, data, slots))
652 }
653
654 pub unsafe fn new_unchecked(
660 views: Buffer<BinaryView>,
661 buffers: Arc<[ByteBuffer]>,
662 dtype: DType,
663 validity: Validity,
664 ) -> Self {
665 let data = unsafe {
666 VarBinViewData::new_unchecked(views, buffers, dtype.clone(), validity.clone())
667 };
668 let slots = VarBinViewData::make_slots(&validity, data.len());
669 Self::from_prevalidated_data(dtype, data, slots)
670 }
671
672 pub fn new_handle(
674 views: BufferHandle,
675 buffers: Arc<[BufferHandle]>,
676 dtype: DType,
677 validity: Validity,
678 ) -> Self {
679 let data = VarBinViewData::new_handle(views, buffers, dtype.clone(), validity.clone());
680 let slots = VarBinViewData::make_slots(&validity, data.len());
681 Self::from_prevalidated_data(dtype, data, slots)
682 }
683
684 pub unsafe fn new_handle_unchecked(
690 views: BufferHandle,
691 buffers: Arc<[BufferHandle]>,
692 dtype: DType,
693 validity: Validity,
694 ) -> Self {
695 let data = unsafe {
696 VarBinViewData::new_handle_unchecked(views, buffers, dtype.clone(), validity.clone())
697 };
698 let slots = VarBinViewData::make_slots(&validity, data.len());
699 Self::from_prevalidated_data(dtype, data, slots)
700 }
701
702 pub fn into_data_parts(self) -> VarBinViewDataParts {
703 let dtype = self.dtype().clone();
704 let validity = self.varbinview_validity();
705 let data = self.into_data();
706 VarBinViewDataParts {
707 dtype,
708 buffers: data.buffers,
709 views: data.views,
710 validity,
711 }
712 }
713}
714
715impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewData {
716 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
717 Self::from_iter_nullable_bin(iter)
718 }
719}
720
721impl FromIterator<Option<Vec<u8>>> for VarBinViewData {
722 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
723 Self::from_iter_nullable_bin(iter)
724 }
725}
726
727impl FromIterator<Option<String>> for VarBinViewData {
728 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
729 Self::from_iter_nullable_str(iter)
730 }
731}
732
733impl<'a> FromIterator<Option<&'a str>> for VarBinViewData {
734 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
735 Self::from_iter_nullable_str(iter)
736 }
737}
738
739impl<'a> FromIterator<Option<&'a [u8]>> for Array<VarBinView> {
742 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
743 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
744 }
745}
746
747impl FromIterator<Option<Vec<u8>>> for Array<VarBinView> {
748 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
749 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
750 }
751}
752
753impl FromIterator<Option<String>> for Array<VarBinView> {
754 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
755 Self::from_iter_nullable_str(iter)
756 }
757}
758
759impl<'a> FromIterator<Option<&'a str>> for Array<VarBinView> {
760 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
761 Self::from_iter_nullable_str(iter)
762 }
763}