1use std::fmt::{Debug, Formatter};
5use std::hash::{Hash, Hasher};
6use std::ops::Range;
7use std::sync::Arc;
8
9use static_assertions::{assert_eq_align, assert_eq_size};
10use vortex_buffer::{Buffer, ByteBuffer};
11use vortex_dtype::{DType, Nullability};
12use vortex_error::{
13 VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_ensure, vortex_err, vortex_panic,
14};
15
16use crate::builders::{ArrayBuilder, VarBinViewBuilder};
17use crate::stats::{ArrayStats, StatsSetRef};
18use crate::validity::Validity;
19use crate::vtable::{
20 ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper,
21 ValidityVTableFromValidityHelper,
22};
23use crate::{Canonical, EncodingId, EncodingRef, vtable};
24
25mod accessor;
26mod compact;
27mod compute;
28mod ops;
29mod serde;
30
31#[derive(Clone, Copy, Debug, PartialEq, Eq)]
32#[repr(C, align(8))]
33pub struct Inlined {
34 size: u32,
35 data: [u8; BinaryView::MAX_INLINED_SIZE],
36}
37
38impl Inlined {
39 fn new<const N: usize>(value: &[u8]) -> Self {
40 let mut inlined = Self {
41 size: N.try_into().vortex_unwrap(),
42 data: [0u8; BinaryView::MAX_INLINED_SIZE],
43 };
44 inlined.data[..N].copy_from_slice(&value[..N]);
45 inlined
46 }
47
48 #[inline]
49 pub fn value(&self) -> &[u8] {
50 &self.data[0..(self.size as usize)]
51 }
52}
53
/// The reference form of a view: the value bytes live in a separate data buffer.
///
/// The `repr(C)` layout is four consecutive 4-byte fields, for a total of 16 bytes.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    /// Length of the referenced value in bytes.
    size: u32,
    /// Copy of the first four bytes of the referenced value.
    prefix: [u8; 4],
    /// Index of the data buffer that holds the value.
    buffer_index: u32,
    /// Byte offset of the value inside that data buffer.
    offset: u32,
}

impl Ref {
    /// Creates a reference view from its four components.
    pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
        Self {
            size,
            prefix,
            buffer_index,
            offset,
        }
    }

    /// Index of the data buffer that holds the value.
    #[inline]
    pub fn buffer_index(&self) -> u32 {
        self.buffer_index
    }

    /// Byte offset of the value inside its data buffer.
    #[inline]
    pub fn offset(&self) -> u32 {
        self.offset
    }

    /// The stored four-byte prefix of the value.
    #[inline]
    pub fn prefix(&self) -> &[u8; 4] {
        &self.prefix
    }

    /// Byte range `offset..offset + size` of the value inside its data buffer.
    ///
    /// Both operands are widened to `usize` before the addition. Adding them as `u32` would
    /// overflow (panic in debug builds, wrap in release builds) whenever the end of the value
    /// lies past `u32::MAX`. The saturating add matches how `validate_views` computes the same
    /// end offset.
    #[inline]
    pub fn to_range(&self) -> Range<usize> {
        let start = self.offset as usize;
        let end = start.saturating_add(self.size as usize);
        start..end
    }
}
93
94#[derive(Clone, Copy)]
95#[repr(C, align(16))]
96pub union BinaryView {
97 le_bytes: [u8; 16],
100
101 inlined: Inlined,
103
104 _ref: Ref,
106}
107
108assert_eq_size!(BinaryView, [u8; 16]);
109assert_eq_size!(Inlined, [u8; 16]);
110assert_eq_size!(Ref, [u8; 16]);
111assert_eq_align!(BinaryView, u128);
112
113impl Hash for BinaryView {
114 fn hash<H: Hasher>(&self, state: &mut H) {
115 unsafe { std::mem::transmute::<&BinaryView, &[u8; 16]>(self) }.hash(state);
116 }
117}
118
119impl Default for BinaryView {
120 fn default() -> Self {
121 Self::make_view(&[], 0, 0)
122 }
123}
124
125impl BinaryView {
126 pub const MAX_INLINED_SIZE: usize = 12;
127
128 #[inline(never)]
136 pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
137 match value.len() {
138 0 => Self {
139 inlined: Inlined::new::<0>(value),
140 },
141 1 => Self {
142 inlined: Inlined::new::<1>(value),
143 },
144 2 => Self {
145 inlined: Inlined::new::<2>(value),
146 },
147 3 => Self {
148 inlined: Inlined::new::<3>(value),
149 },
150 4 => Self {
151 inlined: Inlined::new::<4>(value),
152 },
153 5 => Self {
154 inlined: Inlined::new::<5>(value),
155 },
156 6 => Self {
157 inlined: Inlined::new::<6>(value),
158 },
159 7 => Self {
160 inlined: Inlined::new::<7>(value),
161 },
162 8 => Self {
163 inlined: Inlined::new::<8>(value),
164 },
165 9 => Self {
166 inlined: Inlined::new::<9>(value),
167 },
168 10 => Self {
169 inlined: Inlined::new::<10>(value),
170 },
171 11 => Self {
172 inlined: Inlined::new::<11>(value),
173 },
174 12 => Self {
175 inlined: Inlined::new::<12>(value),
176 },
177 _ => Self {
178 _ref: Ref::new(
179 u32::try_from(value.len()).vortex_unwrap(),
180 value[0..4].try_into().vortex_unwrap(),
181 block,
182 offset,
183 ),
184 },
185 }
186 }
187
188 #[inline]
190 pub fn empty_view() -> Self {
191 Self::new_inlined(&[])
192 }
193
194 #[inline]
196 pub fn new_inlined(value: &[u8]) -> Self {
197 assert!(
198 value.len() <= Self::MAX_INLINED_SIZE,
199 "expected inlined value to be <= 12 bytes, was {}",
200 value.len()
201 );
202
203 Self::make_view(value, 0, 0)
204 }
205
206 #[inline]
207 pub fn len(&self) -> u32 {
208 unsafe { self.inlined.size }
209 }
210
211 #[inline]
212 pub fn is_empty(&self) -> bool {
213 self.len() > 0
214 }
215
216 #[inline]
217 #[allow(clippy::cast_possible_truncation)]
218 pub fn is_inlined(&self) -> bool {
219 self.len() <= (Self::MAX_INLINED_SIZE as u32)
220 }
221
222 pub fn as_inlined(&self) -> &Inlined {
223 unsafe { &self.inlined }
224 }
225
226 pub fn as_view(&self) -> &Ref {
227 unsafe { &self._ref }
228 }
229
230 pub fn as_u128(&self) -> u128 {
231 unsafe { u128::from_le_bytes(self.le_bytes) }
233 }
234
235 #[inline(always)]
237 pub fn with_buffer_idx(self, buffer_idx: u32) -> Self {
238 if self.is_inlined() {
239 self
240 } else {
241 let view_ref = self.as_view();
243 Self {
244 _ref: Ref::new(
245 self.len(),
246 *view_ref.prefix(),
247 buffer_idx,
248 view_ref.offset(),
249 ),
250 }
251 }
252 }
253
254 #[inline(always)]
257 pub fn offset_view(self, offset: u32) -> Self {
258 if self.is_inlined() {
259 self
260 } else {
261 let view_ref = self.as_view();
263 Self {
264 _ref: Ref::new(
265 self.len(),
266 *view_ref.prefix(),
267 offset + view_ref.buffer_index(),
268 view_ref.offset(),
269 ),
270 }
271 }
272 }
273}
274
275impl From<u128> for BinaryView {
276 fn from(value: u128) -> Self {
277 BinaryView {
278 le_bytes: value.to_le_bytes(),
279 }
280 }
281}
282
283impl Debug for BinaryView {
284 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
285 let mut s = f.debug_struct("BinaryView");
286 if self.is_inlined() {
287 s.field("inline", &"i".to_string());
288 } else {
289 s.field("ref", &"r".to_string());
290 }
291 s.finish()
292 }
293}
294
295vtable!(VarBinView);
296
297impl VTable for VarBinViewVTable {
298 type Array = VarBinViewArray;
299 type Encoding = VarBinViewEncoding;
300
301 type ArrayVTable = Self;
302 type CanonicalVTable = Self;
303 type OperationsVTable = Self;
304 type ValidityVTable = ValidityVTableFromValidityHelper;
305 type VisitorVTable = Self;
306 type ComputeVTable = NotSupported;
307 type EncodeVTable = NotSupported;
308 type PipelineVTable = NotSupported;
309 type SerdeVTable = Self;
310
311 fn id(_encoding: &Self::Encoding) -> EncodingId {
312 EncodingId::new_ref("vortex.varbinview")
313 }
314
315 fn encoding(_array: &Self::Array) -> EncodingRef {
316 EncodingRef::new_ref(VarBinViewEncoding.as_ref())
317 }
318}
319
320#[derive(Clone, Debug)]
380pub struct VarBinViewArray {
381 dtype: DType,
382 buffers: Arc<[ByteBuffer]>,
383 views: Buffer<BinaryView>,
384 validity: Validity,
385 stats_set: ArrayStats,
386}
387
/// Unit type used as the `Encoding` associated type of the `VarBinView` vtable.
#[derive(Debug, Clone)]
pub struct VarBinViewEncoding;
390
391impl VarBinViewArray {
392 pub fn new(
399 views: Buffer<BinaryView>,
400 buffers: Arc<[ByteBuffer]>,
401 dtype: DType,
402 validity: Validity,
403 ) -> Self {
404 Self::try_new(views, buffers, dtype, validity)
405 .vortex_expect("VarBinViewArray construction failed")
406 }
407
408 pub fn try_new(
417 views: Buffer<BinaryView>,
418 buffers: Arc<[ByteBuffer]>,
419 dtype: DType,
420 validity: Validity,
421 ) -> VortexResult<Self> {
422 Self::validate(&views, &buffers, &dtype, &validity)?;
423
424 Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) })
426 }
427
428 pub unsafe fn new_unchecked(
458 views: Buffer<BinaryView>,
459 buffers: Arc<[ByteBuffer]>,
460 dtype: DType,
461 validity: Validity,
462 ) -> Self {
463 Self {
464 dtype,
465 buffers,
466 views,
467 validity,
468 stats_set: Default::default(),
469 }
470 }
471
472 pub(crate) fn validate(
476 views: &Buffer<BinaryView>,
477 buffers: &Arc<[ByteBuffer]>,
478 dtype: &DType,
479 validity: &Validity,
480 ) -> VortexResult<()> {
481 vortex_ensure!(
482 validity.nullability() == dtype.nullability(),
483 "validity {:?} incompatible with nullability {:?}",
484 validity,
485 dtype.nullability()
486 );
487
488 match dtype {
489 DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
490 simdutf8::basic::from_utf8(string).is_ok()
491 })?,
492 DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
493 _ => vortex_bail!("invalid DType {dtype} for `VarBinViewArray`"),
494 }
495
496 Ok(())
497 }
498
499 fn validate_views<F>(
500 views: &Buffer<BinaryView>,
501 buffers: &Arc<[ByteBuffer]>,
502 validity: &Validity,
503 validator: F,
504 ) -> VortexResult<()>
505 where
506 F: Fn(&[u8]) -> bool,
507 {
508 for (idx, &view) in views.iter().enumerate() {
509 if validity.is_null(idx) {
510 continue;
511 }
512
513 if view.is_inlined() {
514 let bytes = &unsafe { view.inlined }.data[..view.len() as usize];
516 vortex_ensure!(
517 validator(bytes),
518 "view at index {idx}: inlined bytes failed utf-8 validation"
519 );
520 } else {
521 let view = view.as_view();
523 let buf_index = view.buffer_index as usize;
524 let start_offset = view.offset as usize;
525 let end_offset = start_offset.saturating_add(view.size as usize);
526
527 let buf = buffers.get(buf_index).ok_or_else(||
528 vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers",
529 buffers.len()))?;
530
531 vortex_ensure!(
532 start_offset < buf.len(),
533 "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
534 buf.len(),
535 );
536
537 vortex_ensure!(
538 end_offset <= buf.len(),
539 "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
540 buf.len(),
541 );
542
543 let bytes = &buf[start_offset..end_offset];
545 vortex_ensure!(
546 view.prefix == bytes[..4],
547 "VarBinView prefix does not match full string"
548 );
549
550 vortex_ensure!(
552 validator(bytes),
553 "view at index {idx}: outlined bytes fails utf-8 validation"
554 );
555 }
556 }
557
558 Ok(())
559 }
560
561 pub fn nbuffers(&self) -> usize {
563 self.buffers.len()
564 }
565
566 #[inline]
572 pub fn views(&self) -> &Buffer<BinaryView> {
573 &self.views
574 }
575
576 #[inline]
580 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
581 let views = self.views();
582 let view = &views[index];
583 if !view.is_inlined() {
585 let view_ref = view.as_view();
586 self.buffer(view_ref.buffer_index() as usize)
587 .slice(view_ref.to_range())
588 } else {
589 views
591 .clone()
592 .into_byte_buffer()
593 .slice_ref(view.as_inlined().value())
594 }
595 }
596
597 #[inline]
604 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
605 if idx >= self.nbuffers() {
606 vortex_panic!(
607 "{idx} buffer index out of bounds, there are {} buffers",
608 self.nbuffers()
609 );
610 }
611 &self.buffers[idx]
612 }
613
614 #[inline]
616 pub fn buffers(&self) -> &Arc<[ByteBuffer]> {
617 &self.buffers
618 }
619
620 #[allow(clippy::same_name_method)]
622 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
623 iter: I,
624 dtype: DType,
625 ) -> Self {
626 let iter = iter.into_iter();
627 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
628
629 for item in iter {
630 match item {
631 None => builder.append_null(),
632 Some(v) => builder.append_value(v),
633 }
634 }
635
636 builder.finish_into_varbinview()
637 }
638
639 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
640 let iter = iter.into_iter();
641 let mut builder = VarBinViewBuilder::with_capacity(
642 DType::Utf8(Nullability::NonNullable),
643 iter.size_hint().0,
644 );
645
646 for item in iter {
647 builder.append_value(item.as_ref());
648 }
649
650 builder.finish_into_varbinview()
651 }
652
653 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
654 iter: I,
655 ) -> Self {
656 let iter = iter.into_iter();
657 let mut builder = VarBinViewBuilder::with_capacity(
658 DType::Utf8(Nullability::Nullable),
659 iter.size_hint().0,
660 );
661
662 for item in iter {
663 match item {
664 None => builder.append_null(),
665 Some(v) => builder.append_value(v.as_ref()),
666 }
667 }
668
669 builder.finish_into_varbinview()
670 }
671
672 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
673 let iter = iter.into_iter();
674 let mut builder = VarBinViewBuilder::with_capacity(
675 DType::Binary(Nullability::NonNullable),
676 iter.size_hint().0,
677 );
678
679 for item in iter {
680 builder.append_value(item.as_ref());
681 }
682
683 builder.finish_into_varbinview()
684 }
685
686 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
687 iter: I,
688 ) -> Self {
689 let iter = iter.into_iter();
690 let mut builder = VarBinViewBuilder::with_capacity(
691 DType::Binary(Nullability::Nullable),
692 iter.size_hint().0,
693 );
694
695 for item in iter {
696 match item {
697 None => builder.append_null(),
698 Some(v) => builder.append_value(v.as_ref()),
699 }
700 }
701
702 builder.finish_into_varbinview()
703 }
704}
705
706impl ArrayVTable<VarBinViewVTable> for VarBinViewVTable {
707 fn len(array: &VarBinViewArray) -> usize {
708 array.views.len()
709 }
710
711 fn dtype(array: &VarBinViewArray) -> &DType {
712 &array.dtype
713 }
714
715 fn stats(array: &VarBinViewArray) -> StatsSetRef<'_> {
716 array.stats_set.to_ref(array.as_ref())
717 }
718}
719
720impl ValidityHelper for VarBinViewArray {
721 fn validity(&self) -> &Validity {
722 &self.validity
723 }
724}
725
726impl CanonicalVTable<VarBinViewVTable> for VarBinViewVTable {
727 fn canonicalize(array: &VarBinViewArray) -> Canonical {
728 Canonical::VarBinView(array.clone())
729 }
730
731 fn append_to_builder(array: &VarBinViewArray, builder: &mut dyn ArrayBuilder) {
732 builder.extend_from_array(array.as_ref())
733 }
734}
735
736impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
737 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
738 Self::from_iter_nullable_bin(iter)
739 }
740}
741
742impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
743 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
744 Self::from_iter_nullable_bin(iter)
745 }
746}
747
748impl FromIterator<Option<String>> for VarBinViewArray {
749 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
750 Self::from_iter_nullable_str(iter)
751 }
752}
753
754impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
755 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
756 Self::from_iter_nullable_str(iter)
757 }
758}
759
#[cfg(test)]
mod test {
    use vortex_scalar::Scalar;

    use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
    use crate::{Array, ToCanonical};

    /// Eleven bytes: short enough to be stored inline.
    const SHORT: &str = "hello world";
    /// Longer than the inline limit, so it is stored in a data buffer.
    const LONG: &str = "hello world this is a long string";

    /// Inlined and referenced values both read back unchanged.
    #[test]
    pub fn varbin_view() {
        let array = VarBinViewArray::from_iter_str([SHORT, LONG]);
        assert_eq!(array.len(), 2);
        assert_eq!(array.scalar_at(0), Scalar::from(SHORT));
        assert_eq!(array.scalar_at(1), Scalar::from(LONG));
    }

    /// Slicing keeps the selected element readable.
    #[test]
    pub fn slice_array() {
        let sliced = VarBinViewArray::from_iter_str([SHORT, LONG]).slice(1..2);
        assert_eq!(sliced.scalar_at(0), Scalar::from(LONG));
    }

    /// Converting to the canonical view form preserves the values.
    #[test]
    pub fn flatten_array() {
        let source = VarBinViewArray::from_iter_str(["string1", "string2"]);
        let canonical = source.to_varbinview();
        assert_eq!(canonical.scalar_at(0), Scalar::from("string1"));
        assert_eq!(canonical.scalar_at(1), Scalar::from("string2"));
    }

    /// A view is exactly 16 bytes with 16-byte alignment.
    #[test]
    pub fn binary_view_size_and_alignment() {
        assert_eq!(std::mem::size_of::<BinaryView>(), 16);
        assert_eq!(std::mem::align_of::<BinaryView>(), 16);
    }
}