1use std::fmt::{Debug, Formatter};
5use std::hash::{Hash, Hasher};
6use std::ops::Range;
7use std::sync::Arc;
8
9use static_assertions::{assert_eq_align, assert_eq_size};
10use vortex_buffer::{Buffer, ByteBuffer};
11use vortex_dtype::{DType, Nullability};
12use vortex_error::{
13 VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_ensure, vortex_err, vortex_panic,
14};
15
16use crate::builders::{ArrayBuilder, VarBinViewBuilder};
17use crate::stats::{ArrayStats, StatsSetRef};
18use crate::validity::Validity;
19use crate::vtable::{
20 ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper,
21 ValidityVTableFromValidityHelper,
22};
23use crate::{Canonical, EncodingId, EncodingRef, vtable};
24
25mod accessor;
26mod compact;
27mod compute;
28mod ops;
29mod serde;
30
31#[derive(Clone, Copy, Debug, PartialEq, Eq)]
32#[repr(C, align(8))]
33pub struct Inlined {
34 size: u32,
35 data: [u8; BinaryView::MAX_INLINED_SIZE],
36}
37
38impl Inlined {
39 fn new<const N: usize>(value: &[u8]) -> Self {
40 let mut inlined = Self {
41 size: N.try_into().vortex_unwrap(),
42 data: [0u8; BinaryView::MAX_INLINED_SIZE],
43 };
44 inlined.data[..N].copy_from_slice(&value[..N]);
45 inlined
46 }
47
48 #[inline]
49 pub fn value(&self) -> &[u8] {
50 &self.data[0..(self.size as usize)]
51 }
52}
53
54#[derive(Clone, Copy, Debug)]
55#[repr(C, align(8))]
56pub struct Ref {
57 size: u32,
58 prefix: [u8; 4],
59 buffer_index: u32,
60 offset: u32,
61}
62
63impl Ref {
64 pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
65 Self {
66 size,
67 prefix,
68 buffer_index,
69 offset,
70 }
71 }
72
73 #[inline]
74 pub fn buffer_index(&self) -> u32 {
75 self.buffer_index
76 }
77
78 #[inline]
79 pub fn offset(&self) -> u32 {
80 self.offset
81 }
82
83 #[inline]
84 pub fn prefix(&self) -> &[u8; 4] {
85 &self.prefix
86 }
87
88 #[inline]
89 pub fn to_range(&self) -> Range<usize> {
90 self.offset as usize..(self.offset + self.size) as usize
91 }
92}
93
94#[derive(Clone, Copy)]
95#[repr(C, align(16))]
96pub union BinaryView {
97 le_bytes: [u8; 16],
100
101 inlined: Inlined,
103
104 _ref: Ref,
106}
107
108assert_eq_size!(BinaryView, [u8; 16]);
109assert_eq_size!(Inlined, [u8; 16]);
110assert_eq_size!(Ref, [u8; 16]);
111assert_eq_align!(BinaryView, u128);
112
113impl Hash for BinaryView {
114 fn hash<H: Hasher>(&self, state: &mut H) {
115 unsafe { std::mem::transmute::<&BinaryView, &[u8; 16]>(self) }.hash(state);
116 }
117}
118
119impl BinaryView {
120 pub const MAX_INLINED_SIZE: usize = 12;
121
122 #[inline(never)]
130 pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
131 match value.len() {
132 0 => Self {
133 inlined: Inlined::new::<0>(value),
134 },
135 1 => Self {
136 inlined: Inlined::new::<1>(value),
137 },
138 2 => Self {
139 inlined: Inlined::new::<2>(value),
140 },
141 3 => Self {
142 inlined: Inlined::new::<3>(value),
143 },
144 4 => Self {
145 inlined: Inlined::new::<4>(value),
146 },
147 5 => Self {
148 inlined: Inlined::new::<5>(value),
149 },
150 6 => Self {
151 inlined: Inlined::new::<6>(value),
152 },
153 7 => Self {
154 inlined: Inlined::new::<7>(value),
155 },
156 8 => Self {
157 inlined: Inlined::new::<8>(value),
158 },
159 9 => Self {
160 inlined: Inlined::new::<9>(value),
161 },
162 10 => Self {
163 inlined: Inlined::new::<10>(value),
164 },
165 11 => Self {
166 inlined: Inlined::new::<11>(value),
167 },
168 12 => Self {
169 inlined: Inlined::new::<12>(value),
170 },
171 _ => Self {
172 _ref: Ref::new(
173 u32::try_from(value.len()).vortex_unwrap(),
174 value[0..4].try_into().vortex_unwrap(),
175 block,
176 offset,
177 ),
178 },
179 }
180 }
181
182 #[inline]
184 pub fn empty_view() -> Self {
185 Self::new_inlined(&[])
186 }
187
188 #[inline]
190 pub fn new_inlined(value: &[u8]) -> Self {
191 assert!(
192 value.len() <= Self::MAX_INLINED_SIZE,
193 "expected inlined value to be <= 12 bytes, was {}",
194 value.len()
195 );
196
197 Self::make_view(value, 0, 0)
198 }
199
200 #[inline]
201 pub fn len(&self) -> u32 {
202 unsafe { self.inlined.size }
203 }
204
205 #[inline]
206 pub fn is_empty(&self) -> bool {
207 self.len() > 0
208 }
209
210 #[inline]
211 #[allow(clippy::cast_possible_truncation)]
212 pub fn is_inlined(&self) -> bool {
213 self.len() <= (Self::MAX_INLINED_SIZE as u32)
214 }
215
216 pub fn as_inlined(&self) -> &Inlined {
217 unsafe { &self.inlined }
218 }
219
220 pub fn as_view(&self) -> &Ref {
221 unsafe { &self._ref }
222 }
223
224 pub fn as_u128(&self) -> u128 {
225 unsafe { u128::from_le_bytes(self.le_bytes) }
227 }
228
229 #[inline(always)]
231 pub fn with_buffer_idx(self, buffer_idx: u32) -> Self {
232 if self.is_inlined() {
233 self
234 } else {
235 let view_ref = self.as_view();
237 Self {
238 _ref: Ref::new(
239 self.len(),
240 *view_ref.prefix(),
241 buffer_idx,
242 view_ref.offset(),
243 ),
244 }
245 }
246 }
247
248 #[inline(always)]
251 pub fn offset_view(self, offset: u32) -> Self {
252 if self.is_inlined() {
253 self
254 } else {
255 let view_ref = self.as_view();
257 Self {
258 _ref: Ref::new(
259 self.len(),
260 *view_ref.prefix(),
261 offset + view_ref.buffer_index(),
262 view_ref.offset(),
263 ),
264 }
265 }
266 }
267}
268
269impl From<u128> for BinaryView {
270 fn from(value: u128) -> Self {
271 BinaryView {
272 le_bytes: value.to_le_bytes(),
273 }
274 }
275}
276
277impl Debug for BinaryView {
278 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
279 let mut s = f.debug_struct("BinaryView");
280 if self.is_inlined() {
281 s.field("inline", &"i".to_string());
282 } else {
283 s.field("ref", &"r".to_string());
284 }
285 s.finish()
286 }
287}
288
289vtable!(VarBinView);
290
291impl VTable for VarBinViewVTable {
292 type Array = VarBinViewArray;
293 type Encoding = VarBinViewEncoding;
294
295 type ArrayVTable = Self;
296 type CanonicalVTable = Self;
297 type OperationsVTable = Self;
298 type ValidityVTable = ValidityVTableFromValidityHelper;
299 type VisitorVTable = Self;
300 type ComputeVTable = NotSupported;
301 type EncodeVTable = NotSupported;
302 type PipelineVTable = NotSupported;
303 type SerdeVTable = Self;
304
305 fn id(_encoding: &Self::Encoding) -> EncodingId {
306 EncodingId::new_ref("vortex.varbinview")
307 }
308
309 fn encoding(_array: &Self::Array) -> EncodingRef {
310 EncodingRef::new_ref(VarBinViewEncoding.as_ref())
311 }
312}
313
314#[derive(Clone, Debug)]
374pub struct VarBinViewArray {
375 dtype: DType,
376 buffers: Arc<[ByteBuffer]>,
377 views: Buffer<BinaryView>,
378 validity: Validity,
379 stats_set: ArrayStats,
380}
381
382#[derive(Clone, Debug)]
383pub struct VarBinViewEncoding;
384
385impl VarBinViewArray {
386 fn validate(
387 views: &Buffer<BinaryView>,
388 buffers: &Arc<[ByteBuffer]>,
389 dtype: &DType,
390 validity: &Validity,
391 ) -> VortexResult<()> {
392 vortex_ensure!(
393 validity.nullability() == dtype.nullability(),
394 "validity {:?} incompatible with nullability {:?}",
395 validity,
396 dtype.nullability()
397 );
398
399 match dtype {
400 DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
401 simdutf8::basic::from_utf8(string).is_ok()
402 })?,
403 DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
404 _ => vortex_bail!("invalid DType {dtype} for `VarBinViewArray`"),
405 }
406
407 Ok(())
408 }
409
410 fn validate_views<F>(
411 views: &Buffer<BinaryView>,
412 buffers: &Arc<[ByteBuffer]>,
413 validity: &Validity,
414 validator: F,
415 ) -> VortexResult<()>
416 where
417 F: Fn(&[u8]) -> bool,
418 {
419 for (idx, &view) in views.iter().enumerate() {
420 if validity.is_null(idx)? {
421 continue;
422 }
423
424 if view.is_inlined() {
425 let bytes = &unsafe { view.inlined }.data[..view.len() as usize];
427 vortex_ensure!(
428 validator(bytes),
429 "view at index {idx}: inlined bytes failed utf-8 validation"
430 );
431 } else {
432 let view = view.as_view();
434 let buf_index = view.buffer_index as usize;
435 let start_offset = view.offset as usize;
436 let end_offset = start_offset.saturating_add(view.size as usize);
437
438 let buf = buffers.get(buf_index).ok_or_else(||
439 vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers",
440 buffers.len()))?;
441
442 vortex_ensure!(
443 start_offset < buf.len(),
444 "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
445 buf.len(),
446 );
447
448 vortex_ensure!(
449 end_offset <= buf.len(),
450 "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
451 buf.len(),
452 );
453
454 let bytes = &buf[start_offset..end_offset];
456 vortex_ensure!(
457 view.prefix == bytes[..4],
458 "VarBinView prefix does not match full string"
459 );
460
461 vortex_ensure!(
463 validator(bytes),
464 "view at index {idx}: outlined bytes fails utf-8 validation"
465 );
466 }
467 }
468
469 Ok(())
470 }
471}
472
473impl VarBinViewArray {
474 pub unsafe fn new_unchecked(
482 views: Buffer<BinaryView>,
483 buffers: Arc<[ByteBuffer]>,
484 dtype: DType,
485 validity: Validity,
486 ) -> Self {
487 Self {
488 dtype,
489 buffers,
490 views,
491 validity,
492 stats_set: Default::default(),
493 }
494 }
495
496 pub fn new(
497 views: Buffer<BinaryView>,
498 buffers: Arc<[ByteBuffer]>,
499 dtype: DType,
500 validity: Validity,
501 ) -> Self {
502 Self::try_new(views, buffers, dtype, validity).vortex_expect("VarBinViewArray new")
503 }
504
505 pub fn try_new(
506 views: Buffer<BinaryView>,
507 buffers: Arc<[ByteBuffer]>,
508 dtype: DType,
509 validity: Validity,
510 ) -> VortexResult<Self> {
511 Self::validate(&views, &buffers, &dtype, &validity)?;
512
513 Ok(Self {
514 dtype,
515 buffers,
516 views,
517 validity,
518 stats_set: Default::default(),
519 })
520 }
521
522 pub fn nbuffers(&self) -> usize {
524 self.buffers.len()
525 }
526
527 #[inline]
533 pub fn views(&self) -> &Buffer<BinaryView> {
534 &self.views
535 }
536
537 #[inline]
541 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
542 let views = self.views();
543 let view = &views[index];
544 if !view.is_inlined() {
546 let view_ref = view.as_view();
547 self.buffer(view_ref.buffer_index() as usize)
548 .slice(view_ref.to_range())
549 } else {
550 views
552 .clone()
553 .into_byte_buffer()
554 .slice_ref(view.as_inlined().value())
555 }
556 }
557
558 #[inline]
565 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
566 if idx >= self.nbuffers() {
567 vortex_panic!(
568 "{idx} buffer index out of bounds, there are {} buffers",
569 self.nbuffers()
570 );
571 }
572 &self.buffers[idx]
573 }
574
575 #[inline]
577 pub fn buffers(&self) -> &Arc<[ByteBuffer]> {
578 &self.buffers
579 }
580
581 #[allow(clippy::same_name_method)]
583 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
584 iter: I,
585 dtype: DType,
586 ) -> Self {
587 let iter = iter.into_iter();
588 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
589
590 for item in iter {
591 match item {
592 None => builder.append_null(),
593 Some(v) => builder.append_value(v),
594 }
595 }
596
597 builder.finish_into_varbinview()
598 }
599
600 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
601 let iter = iter.into_iter();
602 let mut builder = VarBinViewBuilder::with_capacity(
603 DType::Utf8(Nullability::NonNullable),
604 iter.size_hint().0,
605 );
606
607 for item in iter {
608 builder.append_value(item.as_ref());
609 }
610
611 builder.finish_into_varbinview()
612 }
613
614 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
615 iter: I,
616 ) -> Self {
617 let iter = iter.into_iter();
618 let mut builder = VarBinViewBuilder::with_capacity(
619 DType::Utf8(Nullability::Nullable),
620 iter.size_hint().0,
621 );
622
623 for item in iter {
624 match item {
625 None => builder.append_null(),
626 Some(v) => builder.append_value(v.as_ref()),
627 }
628 }
629
630 builder.finish_into_varbinview()
631 }
632
633 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
634 let iter = iter.into_iter();
635 let mut builder = VarBinViewBuilder::with_capacity(
636 DType::Binary(Nullability::NonNullable),
637 iter.size_hint().0,
638 );
639
640 for item in iter {
641 builder.append_value(item.as_ref());
642 }
643
644 builder.finish_into_varbinview()
645 }
646
647 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
648 iter: I,
649 ) -> Self {
650 let iter = iter.into_iter();
651 let mut builder = VarBinViewBuilder::with_capacity(
652 DType::Binary(Nullability::Nullable),
653 iter.size_hint().0,
654 );
655
656 for item in iter {
657 match item {
658 None => builder.append_null(),
659 Some(v) => builder.append_value(v.as_ref()),
660 }
661 }
662
663 builder.finish_into_varbinview()
664 }
665}
666
667impl ArrayVTable<VarBinViewVTable> for VarBinViewVTable {
668 fn len(array: &VarBinViewArray) -> usize {
669 array.views.len()
670 }
671
672 fn dtype(array: &VarBinViewArray) -> &DType {
673 &array.dtype
674 }
675
676 fn stats(array: &VarBinViewArray) -> StatsSetRef<'_> {
677 array.stats_set.to_ref(array.as_ref())
678 }
679}
680
681impl ValidityHelper for VarBinViewArray {
682 fn validity(&self) -> &Validity {
683 &self.validity
684 }
685}
686
687impl CanonicalVTable<VarBinViewVTable> for VarBinViewVTable {
688 fn canonicalize(array: &VarBinViewArray) -> VortexResult<Canonical> {
689 Ok(Canonical::VarBinView(array.clone()))
690 }
691
692 fn append_to_builder(
693 array: &VarBinViewArray,
694 builder: &mut dyn ArrayBuilder,
695 ) -> VortexResult<()> {
696 builder.extend_from_array(array.as_ref())
697 }
698}
699
700impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
701 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
702 Self::from_iter_nullable_bin(iter)
703 }
704}
705
706impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
707 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
708 Self::from_iter_nullable_bin(iter)
709 }
710}
711
712impl FromIterator<Option<String>> for VarBinViewArray {
713 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
714 Self::from_iter_nullable_str(iter)
715 }
716}
717
718impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
719 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
720 Self::from_iter_nullable_str(iter)
721 }
722}
723
724#[cfg(test)]
725mod test {
726 use vortex_scalar::Scalar;
727
728 use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
729 use crate::{Array, Canonical, IntoArray};
730
731 #[test]
732 pub fn varbin_view() {
733 let binary_arr =
734 VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"]);
735 assert_eq!(binary_arr.len(), 2);
736 assert_eq!(binary_arr.scalar_at(0), Scalar::from("hello world"));
737 assert_eq!(
738 binary_arr.scalar_at(1),
739 Scalar::from("hello world this is a long string")
740 );
741 }
742
743 #[test]
744 pub fn slice_array() {
745 let binary_arr =
746 VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"])
747 .slice(1, 2);
748 assert_eq!(
749 binary_arr.scalar_at(0),
750 Scalar::from("hello world this is a long string")
751 );
752 }
753
754 #[test]
755 pub fn flatten_array() {
756 let binary_arr = VarBinViewArray::from_iter_str(["string1", "string2"]);
757
758 let flattened = binary_arr.to_canonical().unwrap();
759 assert!(matches!(flattened, Canonical::VarBinView(_)));
760
761 let var_bin = flattened.into_varbinview().unwrap().into_array();
762 assert_eq!(var_bin.scalar_at(0), Scalar::from("string1"));
763 assert_eq!(var_bin.scalar_at(1), Scalar::from("string2"));
764 }
765
766 #[test]
767 pub fn binary_view_size_and_alignment() {
768 assert_eq!(size_of::<BinaryView>(), 16);
769 assert_eq!(align_of::<BinaryView>(), 16);
770 }
771}