1use std::fmt::{Debug, Formatter};
2use std::ops::Range;
3use std::sync::Arc;
4
5use arrow_array::builder::{BinaryViewBuilder, GenericByteViewBuilder, StringViewBuilder};
6use arrow_array::types::{BinaryViewType, ByteViewType, StringViewType};
7use arrow_array::{
8 ArrayRef as ArrowArrayRef, BinaryViewArray, GenericByteViewArray, StringViewArray,
9};
10use arrow_buffer::ScalarBuffer;
11use static_assertions::{assert_eq_align, assert_eq_size};
12use vortex_buffer::{Alignment, Buffer, ByteBuffer};
13use vortex_dtype::DType;
14use vortex_error::{
15 VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_err, vortex_panic,
16};
17use vortex_mask::Mask;
18
19use crate::array::{ArrayCanonicalImpl, ArrayValidityImpl};
20use crate::arrow::FromArrowArray;
21use crate::builders::ArrayBuilder;
22use crate::stats::{ArrayStats, StatsSetRef};
23use crate::validity::Validity;
24use crate::vtable::VTableRef;
25use crate::{
26 Array, ArrayImpl, ArrayRef, ArrayStatisticsImpl, Canonical, EmptyMetadata, Encoding,
27 TryFromArrayRef, try_from_array_ref,
28};
29
30mod accessor;
31mod compute;
32mod serde;
33mod variants;
34
35#[derive(Clone, Copy, Debug, PartialEq, Eq)]
36#[repr(C, align(8))]
37pub struct Inlined {
38 size: u32,
39 data: [u8; BinaryView::MAX_INLINED_SIZE],
40}
41
42impl Inlined {
43 fn new<const N: usize>(value: &[u8]) -> Self {
44 let mut inlined = Self {
45 size: N.try_into().vortex_unwrap(),
46 data: [0u8; BinaryView::MAX_INLINED_SIZE],
47 };
48 inlined.data[..N].copy_from_slice(&value[..N]);
49 inlined
50 }
51
52 #[inline]
53 pub fn value(&self) -> &[u8] {
54 &self.data[0..(self.size as usize)]
55 }
56}
57
/// A view that points at a value stored in a separate data buffer.
///
/// The layout is four consecutive 4-byte fields: length, value prefix, buffer index and the
/// byte offset of the value inside that buffer.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    /// Length of the referenced value in bytes.
    size: u32,
    /// Four prefix bytes supplied by the creator of the view.
    prefix: [u8; 4],
    /// Index of the data buffer that holds the value.
    buffer_index: u32,
    /// Byte offset of the value within the data buffer.
    offset: u32,
}

impl Ref {
    /// Creates a reference view from its four components.
    pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
        Self {
            size,
            prefix,
            buffer_index,
            offset,
        }
    }

    /// Index of the data buffer that holds the value.
    #[inline]
    pub fn buffer_index(&self) -> u32 {
        self.buffer_index
    }

    /// Byte offset of the value within its data buffer.
    #[inline]
    pub fn offset(&self) -> u32 {
        self.offset
    }

    /// The four prefix bytes stored in the view.
    #[inline]
    pub fn prefix(&self) -> &[u8; 4] {
        &self.prefix
    }

    /// Byte range of the value within its data buffer: `offset..offset + size`.
    #[inline]
    pub fn to_range(&self) -> Range<usize> {
        // Widen to `usize` before adding: the sum of two `u32`s can exceed `u32::MAX`, which
        // would panic in debug builds and silently wrap in release builds.
        let start = self.offset as usize;
        let end = start + self.size as usize;
        start..end
    }
}
97
98#[derive(Clone, Copy)]
99#[repr(C, align(16))]
100pub union BinaryView {
101 le_bytes: [u8; 16],
104
105 inlined: Inlined,
107
108 _ref: Ref,
110}
111
112assert_eq_size!(BinaryView, [u8; 16]);
113assert_eq_size!(Inlined, [u8; 16]);
114assert_eq_size!(Ref, [u8; 16]);
115assert_eq_align!(BinaryView, u128);
116
117impl BinaryView {
118 pub const MAX_INLINED_SIZE: usize = 12;
119
120 #[inline(never)]
128 pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
129 match value.len() {
130 0 => Self {
131 inlined: Inlined::new::<0>(value),
132 },
133 1 => Self {
134 inlined: Inlined::new::<1>(value),
135 },
136 2 => Self {
137 inlined: Inlined::new::<2>(value),
138 },
139 3 => Self {
140 inlined: Inlined::new::<3>(value),
141 },
142 4 => Self {
143 inlined: Inlined::new::<4>(value),
144 },
145 5 => Self {
146 inlined: Inlined::new::<5>(value),
147 },
148 6 => Self {
149 inlined: Inlined::new::<6>(value),
150 },
151 7 => Self {
152 inlined: Inlined::new::<7>(value),
153 },
154 8 => Self {
155 inlined: Inlined::new::<8>(value),
156 },
157 9 => Self {
158 inlined: Inlined::new::<9>(value),
159 },
160 10 => Self {
161 inlined: Inlined::new::<10>(value),
162 },
163 11 => Self {
164 inlined: Inlined::new::<11>(value),
165 },
166 12 => Self {
167 inlined: Inlined::new::<12>(value),
168 },
169 _ => Self {
170 _ref: Ref::new(
171 u32::try_from(value.len()).vortex_unwrap(),
172 value[0..4].try_into().vortex_unwrap(),
173 block,
174 offset,
175 ),
176 },
177 }
178 }
179
180 #[inline]
182 pub fn empty_view() -> Self {
183 Self::new_inlined(&[])
184 }
185
186 #[inline]
188 pub fn new_inlined(value: &[u8]) -> Self {
189 assert!(
190 value.len() <= Self::MAX_INLINED_SIZE,
191 "expected inlined value to be <= 12 bytes, was {}",
192 value.len()
193 );
194
195 Self::make_view(value, 0, 0)
196 }
197
198 #[inline]
199 pub fn len(&self) -> u32 {
200 unsafe { self.inlined.size }
201 }
202
203 #[inline]
204 pub fn is_empty(&self) -> bool {
205 self.len() > 0
206 }
207
208 #[inline]
209 #[allow(clippy::cast_possible_truncation)]
210 pub fn is_inlined(&self) -> bool {
211 self.len() <= (Self::MAX_INLINED_SIZE as u32)
212 }
213
214 pub fn as_inlined(&self) -> &Inlined {
215 unsafe { &self.inlined }
216 }
217
218 pub fn as_view(&self) -> &Ref {
219 unsafe { &self._ref }
220 }
221
222 pub fn as_u128(&self) -> u128 {
223 unsafe { u128::from_le_bytes(self.le_bytes) }
225 }
226
227 #[inline(always)]
230 pub fn offset_view(self, offset: u32) -> Self {
231 if self.is_inlined() {
232 self
233 } else {
234 let view_ref = self.as_view();
236 Self {
237 _ref: Ref::new(
238 self.len(),
239 *view_ref.prefix(),
240 offset + view_ref.buffer_index(),
241 view_ref.offset(),
242 ),
243 }
244 }
245 }
246}
247
248impl From<u128> for BinaryView {
249 fn from(value: u128) -> Self {
250 BinaryView {
251 le_bytes: value.to_le_bytes(),
252 }
253 }
254}
255
256impl Debug for BinaryView {
257 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
258 let mut s = f.debug_struct("BinaryView");
259 if self.is_inlined() {
260 s.field("inline", &"i".to_string());
261 } else {
262 s.field("ref", &"r".to_string());
263 }
264 s.finish()
265 }
266}
267
268#[derive(Clone, Debug)]
269pub struct VarBinViewArray {
270 dtype: DType,
271 buffers: Vec<ByteBuffer>,
272 views: Buffer<BinaryView>,
273 validity: Validity,
274 stats_set: ArrayStats,
275}
276
277try_from_array_ref!(VarBinViewArray);
278
279#[derive(Debug)]
280pub struct VarBinViewEncoding;
281impl Encoding for VarBinViewEncoding {
282 type Array = VarBinViewArray;
283 type Metadata = EmptyMetadata;
284}
285
286impl VarBinViewArray {
287 pub fn try_new(
288 views: Buffer<BinaryView>,
289 buffers: Vec<ByteBuffer>,
290 dtype: DType,
291 validity: Validity,
292 ) -> VortexResult<Self> {
293 if views.alignment() != Alignment::of::<BinaryView>() {
294 vortex_bail!("Views must be aligned to a 128 bits");
295 }
296
297 if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
298 vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
299 }
300
301 if dtype.is_nullable() == (validity == Validity::NonNullable) {
302 vortex_bail!("incorrect validity {:?}", validity);
303 }
304
305 Ok(Self {
306 dtype,
307 buffers,
308 views,
309 validity,
310 stats_set: Default::default(),
311 })
312 }
313
314 pub fn nbuffers(&self) -> usize {
316 self.buffers.len()
317 }
318
319 #[inline]
325 pub fn views(&self) -> &Buffer<BinaryView> {
326 &self.views
327 }
328
329 #[inline]
333 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
334 let views = self.views();
335 let view = &views[index];
336 if !view.is_inlined() {
338 let view_ref = view.as_view();
339 self.buffer(view_ref.buffer_index() as usize)
340 .slice(view_ref.to_range())
341 } else {
342 views
344 .clone()
345 .into_byte_buffer()
346 .slice_ref(view.as_inlined().value())
347 }
348 }
349
350 #[inline]
357 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
358 if idx >= self.nbuffers() {
359 vortex_panic!(
360 "{idx} buffer index out of bounds, there are {} buffers",
361 self.nbuffers()
362 );
363 }
364 &self.buffers[idx]
365 }
366
367 #[inline]
369 pub fn buffers(&self) -> &[ByteBuffer] {
370 &self.buffers
371 }
372
373 pub fn validity(&self) -> &Validity {
375 &self.validity
376 }
377
378 #[allow(clippy::same_name_method)]
380 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
381 iter: I,
382 dtype: DType,
383 ) -> Self {
384 match dtype {
385 DType::Utf8(nullability) => {
386 let string_view_array = generic_byte_view_builder::<StringViewType, _, _>(
387 iter.into_iter(),
388 |builder, v| {
389 match v {
390 None => builder.append_null(),
391 Some(inner) => {
392 let utf8 = unsafe { std::str::from_utf8_unchecked(inner.as_ref()) };
394 builder.append_value(utf8);
395 }
396 }
397 },
398 );
399 VarBinViewArray::try_from_array(ArrayRef::from_arrow(
400 &string_view_array,
401 nullability.into(),
402 ))
403 .map_err(|_| vortex_err!("Array was not a VarBinViewArray"))
404 .vortex_expect("StringViewArray to VarBinViewArray downcast")
405 }
406 DType::Binary(nullability) => {
407 let binary_view_array = generic_byte_view_builder::<BinaryViewType, _, _>(
408 iter.into_iter(),
409 GenericByteViewBuilder::append_option,
410 );
411 VarBinViewArray::try_from_array(ArrayRef::from_arrow(
412 &binary_view_array,
413 nullability.into(),
414 ))
415 .map_err(|_| vortex_err!("Array was not a VarBinViewArray"))
416 .vortex_expect("BinaryViewArray to VarBinViewArray downcast")
417 }
418 other => vortex_panic!("VarBinViewArray must be Utf8 or Binary, was {other}"),
419 }
420 }
421
422 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
423 let iter = iter.into_iter();
424 let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0);
425 for s in iter {
426 builder.append_value(s);
427 }
428 let array = ArrayRef::from_arrow(&builder.finish(), false);
429 VarBinViewArray::try_from_array(array)
430 .map_err(|_| vortex_err!("Array was not a VarBinViewArray"))
431 .vortex_expect("VarBinViewArray from StringViewBuilder")
432 }
433
434 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
435 iter: I,
436 ) -> Self {
437 let iter = iter.into_iter();
438 let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0);
439 builder.extend(iter);
440
441 let array = ArrayRef::from_arrow(&builder.finish(), true);
442 VarBinViewArray::try_from_array(array)
443 .map_err(|_| vortex_err!("Array was not a VarBinViewArray"))
444 .vortex_expect("VarBinViewArray from StringViewBuilder")
445 }
446
447 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
448 let iter = iter.into_iter();
449 let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0);
450 for b in iter {
451 builder.append_value(b);
452 }
453 let array = ArrayRef::from_arrow(&builder.finish(), false);
454 VarBinViewArray::try_from_array(array)
455 .map_err(|_| vortex_err!("Array was not a VarBinViewArray"))
456 .vortex_expect("VarBinViewArray from StringViewBuilder")
457 }
458
459 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
460 iter: I,
461 ) -> Self {
462 let iter = iter.into_iter();
463 let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0);
464 builder.extend(iter);
465 let array = ArrayRef::from_arrow(&builder.finish(), true);
466 VarBinViewArray::try_from_array(array)
467 .map_err(|_| vortex_err!("Array was not a VarBinViewArray"))
468 .vortex_expect("VarBinViewArray from StringViewBuilder")
469 }
470}
471
472fn generic_byte_view_builder<B, V, F>(
474 values: impl Iterator<Item = Option<V>>,
475 mut append_fn: F,
476) -> GenericByteViewArray<B>
477where
478 B: ByteViewType,
479 V: AsRef<[u8]>,
480 F: FnMut(&mut GenericByteViewBuilder<B>, Option<V>),
481{
482 let mut builder = GenericByteViewBuilder::<B>::new();
483
484 for value in values {
485 append_fn(&mut builder, value);
486 }
487
488 builder.finish()
489}
490
491impl ArrayImpl for VarBinViewArray {
492 type Encoding = VarBinViewEncoding;
493
494 fn _len(&self) -> usize {
495 self.views.len()
496 }
497
498 fn _dtype(&self) -> &DType {
499 &self.dtype
500 }
501
502 fn _vtable(&self) -> VTableRef {
503 VTableRef::new_ref(&VarBinViewEncoding)
504 }
505
506 fn _with_children(&self, children: &[ArrayRef]) -> VortexResult<Self> {
507 let mut this = self.clone();
508
509 if let Validity::Array(array) = &mut this.validity {
510 *array = children[0].clone();
511 }
512
513 Ok(this)
514 }
515}
516
517impl ArrayStatisticsImpl for VarBinViewArray {
518 fn _stats_ref(&self) -> StatsSetRef<'_> {
519 self.stats_set.to_ref(self)
520 }
521}
522
523impl ArrayCanonicalImpl for VarBinViewArray {
524 fn _to_canonical(&self) -> VortexResult<Canonical> {
525 Ok(Canonical::VarBinView(self.clone()))
526 }
527
528 fn _append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
529 builder.extend_from_array(self)
530 }
531}
532
533pub(crate) fn varbinview_as_arrow(var_bin_view: &VarBinViewArray) -> ArrowArrayRef {
534 let views = var_bin_view.views().clone();
535
536 let nulls = var_bin_view
537 .validity_mask()
538 .vortex_expect("VarBinViewArray: failed to get logical validity")
539 .to_null_buffer();
540
541 let data = (0..var_bin_view.nbuffers())
542 .map(|i| var_bin_view.buffer(i))
543 .collect::<Vec<_>>();
544
545 let data = data
546 .into_iter()
547 .map(|p| p.clone().into_arrow_buffer())
548 .collect::<Vec<_>>();
549
550 match var_bin_view.dtype() {
552 DType::Binary(_) => Arc::new(unsafe {
553 BinaryViewArray::new_unchecked(
554 ScalarBuffer::<u128>::from(views.into_byte_buffer().into_arrow_buffer()),
555 data,
556 nulls,
557 )
558 }),
559 DType::Utf8(_) => Arc::new(unsafe {
560 StringViewArray::new_unchecked(
561 ScalarBuffer::<u128>::from(views.into_byte_buffer().into_arrow_buffer()),
562 data,
563 nulls,
564 )
565 }),
566 _ => vortex_panic!("expected utf8 or binary, got {}", var_bin_view.dtype()),
567 }
568}
569
570impl ArrayValidityImpl for VarBinViewArray {
571 fn _is_valid(&self, index: usize) -> VortexResult<bool> {
572 self.validity.is_valid(index)
573 }
574
575 fn _all_valid(&self) -> VortexResult<bool> {
576 self.validity.all_valid()
577 }
578
579 fn _all_invalid(&self) -> VortexResult<bool> {
580 self.validity.all_invalid()
581 }
582
583 fn _validity_mask(&self) -> VortexResult<Mask> {
584 self.validity.to_mask(self.len())
585 }
586}
587
588impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
589 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
590 Self::from_iter_nullable_bin(iter)
591 }
592}
593
594impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
595 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
596 Self::from_iter_nullable_bin(iter)
597 }
598}
599
600impl FromIterator<Option<String>> for VarBinViewArray {
601 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
602 Self::from_iter_nullable_str(iter)
603 }
604}
605
606impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
607 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
608 Self::from_iter_nullable_str(iter)
609 }
610}
611
#[cfg(test)]
mod test {
    use vortex_scalar::Scalar;

    use crate::Canonical;
    use crate::array::Array;
    use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
    use crate::compute::{scalar_at, slice};

    /// Both an inlined and a buffer-backed string can be read back by index.
    #[test]
    pub fn varbin_view() {
        let expected = ["hello world", "hello world this is a long string"];
        let binary_arr = VarBinViewArray::from_iter_str(expected);

        assert_eq!(binary_arr.len(), 2);
        for (idx, value) in expected.into_iter().enumerate() {
            assert_eq!(scalar_at(&binary_arr, idx).unwrap(), Scalar::from(value));
        }
    }

    /// Slicing off the first element leaves the long string at index 0.
    #[test]
    pub fn slice_array() {
        let source =
            VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"]);
        let binary_arr = slice(&source, 1, 2).unwrap();

        assert_eq!(
            scalar_at(&binary_arr, 0).unwrap(),
            Scalar::from("hello world this is a long string")
        );
    }

    /// Canonicalizing yields a `VarBinView` holding the same values.
    #[test]
    pub fn flatten_array() {
        let binary_arr = VarBinViewArray::from_iter_str(["string1", "string2"]);

        let flattened = binary_arr.to_canonical().unwrap();
        assert!(matches!(flattened, Canonical::VarBinView(_)));

        let var_bin = flattened.into_varbinview().unwrap().into_array();
        let first = scalar_at(&var_bin, 0).unwrap();
        let second = scalar_at(&var_bin, 1).unwrap();
        assert_eq!(first, Scalar::from("string1"));
        assert_eq!(second, Scalar::from("string2"));
    }

    /// A view occupies 16 bytes and is 16-byte aligned.
    #[test]
    pub fn binary_view_size_and_alignment() {
        let (size, align) = (size_of::<BinaryView>(), align_of::<BinaryView>());
        assert_eq!(size, 16);
        assert_eq!(align, 16);
    }
}