1use std::fmt::{Debug, Formatter};
5use std::ops::Range;
6use std::sync::Arc;
7
8use static_assertions::{assert_eq_align, assert_eq_size};
9use vortex_buffer::{Buffer, ByteBuffer};
10use vortex_dtype::{DType, Nullability};
11use vortex_error::{
12 VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_ensure, vortex_err, vortex_panic,
13};
14
15use crate::builders::{ArrayBuilder, VarBinViewBuilder};
16use crate::stats::{ArrayStats, StatsSetRef};
17use crate::validity::Validity;
18use crate::vtable::{
19 ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper,
20 ValidityVTableFromValidityHelper,
21};
22use crate::{Canonical, EncodingId, EncodingRef, vtable};
23
24mod accessor;
25mod compact;
26mod compute;
27mod ops;
28mod serde;
29
30#[derive(Clone, Copy, Debug, PartialEq, Eq)]
31#[repr(C, align(8))]
32pub struct Inlined {
33 size: u32,
34 data: [u8; BinaryView::MAX_INLINED_SIZE],
35}
36
37impl Inlined {
38 fn new<const N: usize>(value: &[u8]) -> Self {
39 let mut inlined = Self {
40 size: N.try_into().vortex_unwrap(),
41 data: [0u8; BinaryView::MAX_INLINED_SIZE],
42 };
43 inlined.data[..N].copy_from_slice(&value[..N]);
44 inlined
45 }
46
47 #[inline]
48 pub fn value(&self) -> &[u8] {
49 &self.data[0..(self.size as usize)]
50 }
51}
52
/// View layout for values whose bytes live in a separate data buffer.
///
/// The layout is four 32-bit fields: the value length, a four-byte prefix of the value,
/// the index of the buffer holding the value, and the value's byte offset in that buffer.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    /// Length of the value in bytes.
    size: u32,
    /// Prefix bytes of the value, stored in the view itself.
    prefix: [u8; 4],
    /// Index of the data buffer that holds the value.
    buffer_index: u32,
    /// Byte offset of the value within that buffer.
    offset: u32,
}

impl Ref {
    /// Creates a reference view from its four components.
    pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
        Self {
            size,
            prefix,
            buffer_index,
            offset,
        }
    }

    /// Index of the data buffer that holds the value.
    #[inline]
    pub fn buffer_index(&self) -> u32 {
        self.buffer_index
    }

    /// Byte offset of the value within its data buffer.
    #[inline]
    pub fn offset(&self) -> u32 {
        self.offset
    }

    /// The prefix bytes stored in the view.
    #[inline]
    pub fn prefix(&self) -> &[u8; 4] {
        &self.prefix
    }

    /// Byte range `offset..offset + size` of the value within its data buffer.
    #[inline]
    pub fn to_range(&self) -> Range<usize> {
        // Widen to `usize` before adding: `offset + size` may exceed `u32::MAX`, and adding
        // in `u32` would panic in debug builds and wrap to a bogus range in release builds.
        let start = self.offset as usize;
        start..start.saturating_add(self.size as usize)
    }
}
92
93#[derive(Clone, Copy)]
94#[repr(C, align(16))]
95pub union BinaryView {
96 le_bytes: [u8; 16],
99
100 inlined: Inlined,
102
103 _ref: Ref,
105}
106
107assert_eq_size!(BinaryView, [u8; 16]);
108assert_eq_size!(Inlined, [u8; 16]);
109assert_eq_size!(Ref, [u8; 16]);
110assert_eq_align!(BinaryView, u128);
111
112impl BinaryView {
113 pub const MAX_INLINED_SIZE: usize = 12;
114
115 #[inline(never)]
123 pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
124 match value.len() {
125 0 => Self {
126 inlined: Inlined::new::<0>(value),
127 },
128 1 => Self {
129 inlined: Inlined::new::<1>(value),
130 },
131 2 => Self {
132 inlined: Inlined::new::<2>(value),
133 },
134 3 => Self {
135 inlined: Inlined::new::<3>(value),
136 },
137 4 => Self {
138 inlined: Inlined::new::<4>(value),
139 },
140 5 => Self {
141 inlined: Inlined::new::<5>(value),
142 },
143 6 => Self {
144 inlined: Inlined::new::<6>(value),
145 },
146 7 => Self {
147 inlined: Inlined::new::<7>(value),
148 },
149 8 => Self {
150 inlined: Inlined::new::<8>(value),
151 },
152 9 => Self {
153 inlined: Inlined::new::<9>(value),
154 },
155 10 => Self {
156 inlined: Inlined::new::<10>(value),
157 },
158 11 => Self {
159 inlined: Inlined::new::<11>(value),
160 },
161 12 => Self {
162 inlined: Inlined::new::<12>(value),
163 },
164 _ => Self {
165 _ref: Ref::new(
166 u32::try_from(value.len()).vortex_unwrap(),
167 value[0..4].try_into().vortex_unwrap(),
168 block,
169 offset,
170 ),
171 },
172 }
173 }
174
175 #[inline]
177 pub fn empty_view() -> Self {
178 Self::new_inlined(&[])
179 }
180
181 #[inline]
183 pub fn new_inlined(value: &[u8]) -> Self {
184 assert!(
185 value.len() <= Self::MAX_INLINED_SIZE,
186 "expected inlined value to be <= 12 bytes, was {}",
187 value.len()
188 );
189
190 Self::make_view(value, 0, 0)
191 }
192
193 #[inline]
194 pub fn len(&self) -> u32 {
195 unsafe { self.inlined.size }
196 }
197
198 #[inline]
199 pub fn is_empty(&self) -> bool {
200 self.len() > 0
201 }
202
203 #[inline]
204 #[allow(clippy::cast_possible_truncation)]
205 pub fn is_inlined(&self) -> bool {
206 self.len() <= (Self::MAX_INLINED_SIZE as u32)
207 }
208
209 pub fn as_inlined(&self) -> &Inlined {
210 unsafe { &self.inlined }
211 }
212
213 pub fn as_view(&self) -> &Ref {
214 unsafe { &self._ref }
215 }
216
217 pub fn as_u128(&self) -> u128 {
218 unsafe { u128::from_le_bytes(self.le_bytes) }
220 }
221
222 #[inline(always)]
224 pub fn with_buffer_idx(self, buffer_idx: u32) -> Self {
225 if self.is_inlined() {
226 self
227 } else {
228 let view_ref = self.as_view();
230 Self {
231 _ref: Ref::new(
232 self.len(),
233 *view_ref.prefix(),
234 buffer_idx,
235 view_ref.offset(),
236 ),
237 }
238 }
239 }
240
241 #[inline(always)]
244 pub fn offset_view(self, offset: u32) -> Self {
245 if self.is_inlined() {
246 self
247 } else {
248 let view_ref = self.as_view();
250 Self {
251 _ref: Ref::new(
252 self.len(),
253 *view_ref.prefix(),
254 offset + view_ref.buffer_index(),
255 view_ref.offset(),
256 ),
257 }
258 }
259 }
260}
261
262impl From<u128> for BinaryView {
263 fn from(value: u128) -> Self {
264 BinaryView {
265 le_bytes: value.to_le_bytes(),
266 }
267 }
268}
269
270impl Debug for BinaryView {
271 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
272 let mut s = f.debug_struct("BinaryView");
273 if self.is_inlined() {
274 s.field("inline", &"i".to_string());
275 } else {
276 s.field("ref", &"r".to_string());
277 }
278 s.finish()
279 }
280}
281
282vtable!(VarBinView);
283
284impl VTable for VarBinViewVTable {
285 type Array = VarBinViewArray;
286 type Encoding = VarBinViewEncoding;
287
288 type ArrayVTable = Self;
289 type CanonicalVTable = Self;
290 type OperationsVTable = Self;
291 type ValidityVTable = ValidityVTableFromValidityHelper;
292 type VisitorVTable = Self;
293 type ComputeVTable = NotSupported;
294 type EncodeVTable = NotSupported;
295 type SerdeVTable = Self;
296
297 fn id(_encoding: &Self::Encoding) -> EncodingId {
298 EncodingId::new_ref("vortex.varbinview")
299 }
300
301 fn encoding(_array: &Self::Array) -> EncodingRef {
302 EncodingRef::new_ref(VarBinViewEncoding.as_ref())
303 }
304}
305
306#[derive(Clone, Debug)]
366pub struct VarBinViewArray {
367 dtype: DType,
368 buffers: Arc<[ByteBuffer]>,
369 views: Buffer<BinaryView>,
370 validity: Validity,
371 stats_set: ArrayStats,
372}
373
/// Encoding marker type for [`VarBinViewArray`].
#[derive(Debug, Clone)]
pub struct VarBinViewEncoding;
376
377impl VarBinViewArray {
378 fn validate(
379 views: &Buffer<BinaryView>,
380 buffers: &Arc<[ByteBuffer]>,
381 dtype: &DType,
382 validity: &Validity,
383 ) -> VortexResult<()> {
384 vortex_ensure!(
385 validity.nullability() == dtype.nullability(),
386 "validity {:?} incompatible with nullability {:?}",
387 validity,
388 dtype.nullability()
389 );
390
391 match dtype {
392 DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
393 std::str::from_utf8(string).is_ok()
394 })?,
395 DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
396 _ => vortex_bail!("invalid DType {dtype}"),
397 }
398
399 Ok(())
400 }
401
402 fn validate_views<F>(
403 views: &Buffer<BinaryView>,
404 buffers: &Arc<[ByteBuffer]>,
405 validity: &Validity,
406 validator: F,
407 ) -> VortexResult<()>
408 where
409 F: Fn(&[u8]) -> bool,
410 {
411 for (idx, &view) in views.iter().enumerate() {
412 if validity.is_null(idx)? {
413 continue;
414 }
415
416 if view.is_inlined() {
417 let bytes = &unsafe { view.inlined }.data[..view.len() as usize];
419 vortex_ensure!(
420 validator(bytes),
421 "view at index {idx}: inlined bytes failed utf-8 validation"
422 );
423 } else {
424 let view = view.as_view();
426 let buf_index = view.buffer_index as usize;
427 let start_offset = view.offset as usize;
428 let end_offset = start_offset.saturating_add(view.size as usize);
429
430 let buf = buffers.get(buf_index).ok_or_else(||
431 vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers",
432 buffers.len()))?;
433
434 vortex_ensure!(
435 start_offset < buf.len(),
436 "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
437 buf.len(),
438 );
439
440 vortex_ensure!(
441 end_offset <= buf.len(),
442 "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
443 buf.len(),
444 );
445
446 let bytes = &buf[start_offset..end_offset];
448 vortex_ensure!(
449 view.prefix == bytes[..4],
450 "VarBinView prefix does not match full string"
451 );
452
453 vortex_ensure!(
455 validator(bytes),
456 "view at index {idx}: outlined bytes fails utf-8 validation"
457 );
458 }
459 }
460
461 Ok(())
462 }
463}
464
465impl VarBinViewArray {
466 pub unsafe fn new_unchecked(
474 views: Buffer<BinaryView>,
475 buffers: Arc<[ByteBuffer]>,
476 dtype: DType,
477 validity: Validity,
478 ) -> Self {
479 Self {
480 dtype,
481 buffers,
482 views,
483 validity,
484 stats_set: Default::default(),
485 }
486 }
487
488 pub fn new(
489 views: Buffer<BinaryView>,
490 buffers: Arc<[ByteBuffer]>,
491 dtype: DType,
492 validity: Validity,
493 ) -> Self {
494 Self::try_new(views, buffers, dtype, validity).vortex_expect("VarBinViewArray new")
495 }
496
497 pub fn try_new(
498 views: Buffer<BinaryView>,
499 buffers: Arc<[ByteBuffer]>,
500 dtype: DType,
501 validity: Validity,
502 ) -> VortexResult<Self> {
503 Self::validate(&views, &buffers, &dtype, &validity)?;
504
505 Ok(Self {
506 dtype,
507 buffers,
508 views,
509 validity,
510 stats_set: Default::default(),
511 })
512 }
513
514 pub fn nbuffers(&self) -> usize {
516 self.buffers.len()
517 }
518
519 #[inline]
525 pub fn views(&self) -> &Buffer<BinaryView> {
526 &self.views
527 }
528
529 #[inline]
533 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
534 let views = self.views();
535 let view = &views[index];
536 if !view.is_inlined() {
538 let view_ref = view.as_view();
539 self.buffer(view_ref.buffer_index() as usize)
540 .slice(view_ref.to_range())
541 } else {
542 views
544 .clone()
545 .into_byte_buffer()
546 .slice_ref(view.as_inlined().value())
547 }
548 }
549
550 #[inline]
557 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
558 if idx >= self.nbuffers() {
559 vortex_panic!(
560 "{idx} buffer index out of bounds, there are {} buffers",
561 self.nbuffers()
562 );
563 }
564 &self.buffers[idx]
565 }
566
567 #[inline]
569 pub fn buffers(&self) -> &Arc<[ByteBuffer]> {
570 &self.buffers
571 }
572
573 #[allow(clippy::same_name_method)]
575 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
576 iter: I,
577 dtype: DType,
578 ) -> Self {
579 let iter = iter.into_iter();
580 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
581
582 for item in iter {
583 match item {
584 None => builder.append_null(),
585 Some(v) => builder.append_value(v),
586 }
587 }
588
589 builder.finish_into_varbinview()
590 }
591
592 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
593 let iter = iter.into_iter();
594 let mut builder = VarBinViewBuilder::with_capacity(
595 DType::Utf8(Nullability::NonNullable),
596 iter.size_hint().0,
597 );
598
599 for item in iter {
600 builder.append_value(item.as_ref());
601 }
602
603 builder.finish_into_varbinview()
604 }
605
606 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
607 iter: I,
608 ) -> Self {
609 let iter = iter.into_iter();
610 let mut builder = VarBinViewBuilder::with_capacity(
611 DType::Utf8(Nullability::Nullable),
612 iter.size_hint().0,
613 );
614
615 for item in iter {
616 match item {
617 None => builder.append_null(),
618 Some(v) => builder.append_value(v.as_ref()),
619 }
620 }
621
622 builder.finish_into_varbinview()
623 }
624
625 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
626 let iter = iter.into_iter();
627 let mut builder = VarBinViewBuilder::with_capacity(
628 DType::Binary(Nullability::NonNullable),
629 iter.size_hint().0,
630 );
631
632 for item in iter {
633 builder.append_value(item.as_ref());
634 }
635
636 builder.finish_into_varbinview()
637 }
638
639 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
640 iter: I,
641 ) -> Self {
642 let iter = iter.into_iter();
643 let mut builder = VarBinViewBuilder::with_capacity(
644 DType::Binary(Nullability::Nullable),
645 iter.size_hint().0,
646 );
647
648 for item in iter {
649 match item {
650 None => builder.append_null(),
651 Some(v) => builder.append_value(v.as_ref()),
652 }
653 }
654
655 builder.finish_into_varbinview()
656 }
657}
658
659impl ArrayVTable<VarBinViewVTable> for VarBinViewVTable {
660 fn len(array: &VarBinViewArray) -> usize {
661 array.views.len()
662 }
663
664 fn dtype(array: &VarBinViewArray) -> &DType {
665 &array.dtype
666 }
667
668 fn stats(array: &VarBinViewArray) -> StatsSetRef<'_> {
669 array.stats_set.to_ref(array.as_ref())
670 }
671}
672
673impl ValidityHelper for VarBinViewArray {
674 fn validity(&self) -> &Validity {
675 &self.validity
676 }
677}
678
679impl CanonicalVTable<VarBinViewVTable> for VarBinViewVTable {
680 fn canonicalize(array: &VarBinViewArray) -> VortexResult<Canonical> {
681 Ok(Canonical::VarBinView(array.clone()))
682 }
683
684 fn append_to_builder(
685 array: &VarBinViewArray,
686 builder: &mut dyn ArrayBuilder,
687 ) -> VortexResult<()> {
688 builder.extend_from_array(array.as_ref())
689 }
690}
691
692impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
693 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
694 Self::from_iter_nullable_bin(iter)
695 }
696}
697
698impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
699 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
700 Self::from_iter_nullable_bin(iter)
701 }
702}
703
704impl FromIterator<Option<String>> for VarBinViewArray {
705 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
706 Self::from_iter_nullable_str(iter)
707 }
708}
709
710impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
711 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
712 Self::from_iter_nullable_str(iter)
713 }
714}
715
#[cfg(test)]
mod test {
    use vortex_scalar::Scalar;

    use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
    use crate::{Array, Canonical, IntoArray};

    /// 11 bytes: short enough to be stored inline in its view.
    const SHORT: &str = "hello world";
    /// 33 bytes: too long to be inlined, so it is stored in a data buffer.
    const LONG: &str = "hello world this is a long string";

    /// Both an inlined and a buffer-backed value can be read back.
    #[test]
    pub fn varbin_view() {
        let array = VarBinViewArray::from_iter_str([SHORT, LONG]);

        assert_eq!(array.len(), 2);
        assert_eq!(array.scalar_at(0), Scalar::from(SHORT));
        assert_eq!(array.scalar_at(1), Scalar::from(LONG));
    }

    /// Slicing keeps the buffer-backed value readable at its new position.
    #[test]
    pub fn slice_array() {
        let sliced = VarBinViewArray::from_iter_str([SHORT, LONG]).slice(1, 2);

        assert_eq!(sliced.scalar_at(0), Scalar::from(LONG));
    }

    /// Canonicalizing yields the `VarBinView` variant with the same values.
    #[test]
    pub fn flatten_array() {
        let array = VarBinViewArray::from_iter_str(["string1", "string2"]);

        let canonical = array.to_canonical().unwrap();
        assert!(matches!(canonical, Canonical::VarBinView(_)));

        let round_tripped = canonical.into_varbinview().unwrap().into_array();
        assert_eq!(round_tripped.scalar_at(0), Scalar::from("string1"));
        assert_eq!(round_tripped.scalar_at(1), Scalar::from("string2"));
    }

    /// The view union is exactly 16 bytes with 16-byte alignment.
    #[test]
    pub fn binary_view_size_and_alignment() {
        assert_eq!(std::mem::size_of::<BinaryView>(), 16);
        assert_eq!(std::mem::align_of::<BinaryView>(), 16);
    }
}