1use std::fmt::{Debug, Formatter};
2use std::ops::Range;
3use std::sync::Arc;
4
5use arrow_array::builder::{BinaryViewBuilder, GenericByteViewBuilder, StringViewBuilder};
6use arrow_array::types::{BinaryViewType, ByteViewType, StringViewType};
7use arrow_array::{
8 ArrayRef as ArrowArrayRef, BinaryViewArray, GenericByteViewArray, StringViewArray,
9};
10use arrow_buffer::ScalarBuffer;
11use static_assertions::{assert_eq_align, assert_eq_size};
12use vortex_buffer::{Alignment, Buffer, ByteBuffer};
13use vortex_dtype::DType;
14use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_panic};
15use vortex_mask::Mask;
16
17use crate::array::{ArrayCanonicalImpl, ArrayValidityImpl};
18use crate::arrow::FromArrowArray;
19use crate::builders::ArrayBuilder;
20use crate::stats::{ArrayStats, StatsSetRef};
21use crate::validity::Validity;
22use crate::vtable::{EncodingVTable, VTableRef};
23use crate::{
24 Array, ArrayImpl, ArrayRef, ArrayStatisticsImpl, Canonical, EmptyMetadata, Encoding,
25 EncodingId, TryFromArrayRef, try_from_array_ref,
26};
27
28mod accessor;
29mod compute;
30mod serde;
31mod stats;
32mod variants;
33
34#[derive(Clone, Copy, Debug, PartialEq, Eq)]
35#[repr(C, align(8))]
36pub struct Inlined {
37 size: u32,
38 data: [u8; BinaryView::MAX_INLINED_SIZE],
39}
40
41impl Inlined {
42 pub fn new(value: &[u8]) -> Self {
43 assert!(
44 value.len() <= BinaryView::MAX_INLINED_SIZE,
45 "Inlined strings must be shorter than 13 characters, {} given",
46 value.len()
47 );
48 let mut inlined = Self {
49 size: value.len().try_into().vortex_unwrap(),
50 data: [0u8; BinaryView::MAX_INLINED_SIZE],
51 };
52 inlined.data[..value.len()].copy_from_slice(value);
53 inlined
54 }
55
56 #[inline]
57 pub fn value(&self) -> &[u8] {
58 &self.data[0..(self.size as usize)]
59 }
60}
61
/// A view of a value that is stored out-of-line in one of the array's data buffers.
///
/// Records the value length, a 4-byte prefix, the index of the buffer that holds the value
/// and the byte offset of the value inside that buffer.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    /// Length of the referenced value in bytes.
    size: u32,
    /// Four prefix bytes carried alongside the reference.
    prefix: [u8; 4],
    /// Index of the data buffer holding the value.
    buffer_index: u32,
    /// Byte offset of the value within that buffer.
    offset: u32,
}

impl Ref {
    /// Creates a reference view from its four components.
    pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
        Self {
            size,
            prefix,
            buffer_index,
            offset,
        }
    }

    /// Index of the data buffer holding the value.
    #[inline]
    pub fn buffer_index(&self) -> u32 {
        self.buffer_index
    }

    /// Byte offset of the value within its data buffer.
    #[inline]
    pub fn offset(&self) -> u32 {
        self.offset
    }

    /// The 4-byte prefix stored in the view.
    #[inline]
    pub fn prefix(&self) -> &[u8; 4] {
        &self.prefix
    }

    /// Byte range `offset..offset + size` of the value within its data buffer.
    ///
    /// Both bounds are widened to `usize` before they are added, so an `offset + size` that
    /// exceeds `u32::MAX` neither panics (debug builds) nor wraps around (release builds).
    #[inline]
    pub fn to_range(&self) -> Range<usize> {
        let start = self.offset as usize;
        let end = start + self.size as usize;
        start..end
    }
}
101
102#[derive(Clone, Copy)]
103#[repr(C, align(16))]
104pub union BinaryView {
105 le_bytes: [u8; 16],
108
109 inlined: Inlined,
111
112 _ref: Ref,
114}
115
116assert_eq_size!(BinaryView, [u8; 16]);
117assert_eq_size!(Inlined, [u8; 16]);
118assert_eq_size!(Ref, [u8; 16]);
119assert_eq_align!(BinaryView, u128);
120
121impl BinaryView {
122 pub const MAX_INLINED_SIZE: usize = 12;
123
124 pub fn empty_view() -> Self {
125 Self {
126 inlined: Inlined::new(&[]),
127 }
128 }
129
130 pub fn new_inlined(value: &[u8]) -> Self {
131 assert!(
132 value.len() <= Self::MAX_INLINED_SIZE,
133 "expected inlined value to be <= 12 bytes, was {}",
134 value.len()
135 );
136
137 Self {
138 inlined: Inlined::new(value),
139 }
140 }
141
142 pub fn new_view(len: u32, prefix: [u8; 4], block: u32, offset: u32) -> Self {
144 Self {
145 _ref: Ref::new(len, prefix, block, offset),
146 }
147 }
148
149 #[inline]
150 pub fn len(&self) -> u32 {
151 unsafe { self.inlined.size }
152 }
153
154 #[inline]
155 pub fn is_empty(&self) -> bool {
156 self.len() > 0
157 }
158
159 #[inline]
160 #[allow(clippy::cast_possible_truncation)]
161 pub fn is_inlined(&self) -> bool {
162 self.len() <= (Self::MAX_INLINED_SIZE as u32)
163 }
164
165 pub fn as_inlined(&self) -> &Inlined {
166 unsafe { &self.inlined }
167 }
168
169 pub fn as_view(&self) -> &Ref {
170 unsafe { &self._ref }
171 }
172
173 pub fn as_u128(&self) -> u128 {
174 unsafe { u128::from_le_bytes(self.le_bytes) }
176 }
177
178 #[inline(always)]
181 pub fn offset_view(self, offset: u32) -> Self {
182 if self.is_inlined() {
183 self
184 } else {
185 let view_ref = self.as_view();
187 BinaryView::new_view(
188 self.len(),
189 *view_ref.prefix(),
190 offset + view_ref.buffer_index(),
191 view_ref.offset(),
192 )
193 }
194 }
195}
196
197impl From<u128> for BinaryView {
198 fn from(value: u128) -> Self {
199 BinaryView {
200 le_bytes: value.to_le_bytes(),
201 }
202 }
203}
204
205impl Debug for BinaryView {
206 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
207 let mut s = f.debug_struct("BinaryView");
208 if self.is_inlined() {
209 s.field("inline", &"i".to_string());
210 } else {
211 s.field("ref", &"r".to_string());
212 }
213 s.finish()
214 }
215}
216
217#[derive(Clone, Debug)]
218pub struct VarBinViewArray {
219 dtype: DType,
220 buffers: Vec<ByteBuffer>,
221 views: Buffer<BinaryView>,
222 validity: Validity,
223 stats_set: ArrayStats,
224}
225
226try_from_array_ref!(VarBinViewArray);
227
228pub struct VarBinViewEncoding;
229impl Encoding for VarBinViewEncoding {
230 type Array = VarBinViewArray;
231 type Metadata = EmptyMetadata;
232}
233
234impl EncodingVTable for VarBinViewEncoding {
235 fn id(&self) -> EncodingId {
236 EncodingId::new_ref("vortex.varbinview")
237 }
238}
239
240impl VarBinViewArray {
241 pub fn try_new(
242 views: Buffer<BinaryView>,
243 buffers: Vec<ByteBuffer>,
244 dtype: DType,
245 validity: Validity,
246 ) -> VortexResult<Self> {
247 if views.alignment() != Alignment::of::<BinaryView>() {
248 vortex_bail!("Views must be aligned to a 128 bits");
249 }
250
251 if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
252 vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
253 }
254
255 if dtype.is_nullable() == (validity == Validity::NonNullable) {
256 vortex_bail!("incorrect validity {:?}", validity);
257 }
258
259 Ok(Self {
260 dtype,
261 buffers,
262 views,
263 validity,
264 stats_set: Default::default(),
265 })
266 }
267
268 pub fn nbuffers(&self) -> usize {
270 self.buffers.len()
271 }
272
273 #[inline]
279 pub fn views(&self) -> &Buffer<BinaryView> {
280 &self.views
281 }
282
283 #[inline]
287 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
288 let views = self.views();
289 let view = &views[index];
290 if !view.is_inlined() {
292 let view_ref = view.as_view();
293 self.buffer(view_ref.buffer_index() as usize)
294 .slice(view_ref.to_range())
295 } else {
296 views
298 .clone()
299 .into_byte_buffer()
300 .slice_ref(view.as_inlined().value())
301 }
302 }
303
304 #[inline]
311 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
312 if idx >= self.nbuffers() {
313 vortex_panic!(
314 "{idx} buffer index out of bounds, there are {} buffers",
315 self.nbuffers()
316 );
317 }
318 &self.buffers[idx]
319 }
320
321 #[inline]
323 pub fn buffers(&self) -> &[ByteBuffer] {
324 &self.buffers
325 }
326
327 pub fn validity(&self) -> &Validity {
329 &self.validity
330 }
331
332 #[allow(clippy::same_name_method)]
334 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
335 iter: I,
336 dtype: DType,
337 ) -> Self {
338 match dtype {
339 DType::Utf8(nullability) => {
340 let string_view_array = generic_byte_view_builder::<StringViewType, _, _>(
341 iter.into_iter(),
342 |builder, v| {
343 match v {
344 None => builder.append_null(),
345 Some(inner) => {
346 let utf8 = unsafe { std::str::from_utf8_unchecked(inner.as_ref()) };
348 builder.append_value(utf8);
349 }
350 }
351 },
352 );
353 VarBinViewArray::try_from_array(ArrayRef::from_arrow(
354 &string_view_array,
355 nullability.into(),
356 ))
357 .vortex_expect("StringViewArray to VarBinViewArray downcast")
358 }
359 DType::Binary(nullability) => {
360 let binary_view_array = generic_byte_view_builder::<BinaryViewType, _, _>(
361 iter.into_iter(),
362 GenericByteViewBuilder::append_option,
363 );
364 VarBinViewArray::try_from_array(ArrayRef::from_arrow(
365 &binary_view_array,
366 nullability.into(),
367 ))
368 .vortex_expect("BinaryViewArray to VarBinViewArray downcast")
369 }
370 other => vortex_panic!("VarBinViewArray must be Utf8 or Binary, was {other}"),
371 }
372 }
373
374 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
375 let iter = iter.into_iter();
376 let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0);
377 for s in iter {
378 builder.append_value(s);
379 }
380 let array = ArrayRef::from_arrow(&builder.finish(), false);
381 VarBinViewArray::try_from_array(array)
382 .vortex_expect("VarBinViewArray from StringViewBuilder")
383 }
384
385 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
386 iter: I,
387 ) -> Self {
388 let iter = iter.into_iter();
389 let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0);
390 builder.extend(iter);
391
392 let array = ArrayRef::from_arrow(&builder.finish(), true);
393 VarBinViewArray::try_from_array(array)
394 .vortex_expect("VarBinViewArray from StringViewBuilder")
395 }
396
397 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
398 let iter = iter.into_iter();
399 let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0);
400 for b in iter {
401 builder.append_value(b);
402 }
403 let array = ArrayRef::from_arrow(&builder.finish(), false);
404 VarBinViewArray::try_from_array(array)
405 .vortex_expect("VarBinViewArray from StringViewBuilder")
406 }
407
408 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
409 iter: I,
410 ) -> Self {
411 let iter = iter.into_iter();
412 let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0);
413 builder.extend(iter);
414 let array = ArrayRef::from_arrow(&builder.finish(), true);
415 VarBinViewArray::try_from_array(array)
416 .vortex_expect("VarBinViewArray from StringViewBuilder")
417 }
418}
419
420fn generic_byte_view_builder<B, V, F>(
422 values: impl Iterator<Item = Option<V>>,
423 mut append_fn: F,
424) -> GenericByteViewArray<B>
425where
426 B: ByteViewType,
427 V: AsRef<[u8]>,
428 F: FnMut(&mut GenericByteViewBuilder<B>, Option<V>),
429{
430 let mut builder = GenericByteViewBuilder::<B>::new();
431
432 for value in values {
433 append_fn(&mut builder, value);
434 }
435
436 builder.finish()
437}
438
439impl ArrayImpl for VarBinViewArray {
440 type Encoding = VarBinViewEncoding;
441
442 fn _len(&self) -> usize {
443 self.views.len()
444 }
445
446 fn _dtype(&self) -> &DType {
447 &self.dtype
448 }
449
450 fn _vtable(&self) -> VTableRef {
451 VTableRef::new_ref(&VarBinViewEncoding)
452 }
453}
454
455impl ArrayStatisticsImpl for VarBinViewArray {
456 fn _stats_ref(&self) -> StatsSetRef<'_> {
457 self.stats_set.to_ref(self)
458 }
459}
460
461impl ArrayCanonicalImpl for VarBinViewArray {
462 fn _to_canonical(&self) -> VortexResult<Canonical> {
463 Ok(Canonical::VarBinView(self.clone()))
464 }
465
466 fn _append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
467 builder.extend_from_array(self)
468 }
469}
470
471pub(crate) fn varbinview_as_arrow(var_bin_view: &VarBinViewArray) -> ArrowArrayRef {
472 let views = var_bin_view.views().clone();
473
474 let nulls = var_bin_view
475 .validity_mask()
476 .vortex_expect("VarBinViewArray: failed to get logical validity")
477 .to_null_buffer();
478
479 let data = (0..var_bin_view.nbuffers())
480 .map(|i| var_bin_view.buffer(i))
481 .collect::<Vec<_>>();
482
483 let data = data
484 .into_iter()
485 .map(|p| p.clone().into_arrow_buffer())
486 .collect::<Vec<_>>();
487
488 match var_bin_view.dtype() {
490 DType::Binary(_) => Arc::new(unsafe {
491 BinaryViewArray::new_unchecked(
492 ScalarBuffer::<u128>::from(views.into_byte_buffer().into_arrow_buffer()),
493 data,
494 nulls,
495 )
496 }),
497 DType::Utf8(_) => Arc::new(unsafe {
498 StringViewArray::new_unchecked(
499 ScalarBuffer::<u128>::from(views.into_byte_buffer().into_arrow_buffer()),
500 data,
501 nulls,
502 )
503 }),
504 _ => vortex_panic!("expected utf8 or binary, got {}", var_bin_view.dtype()),
505 }
506}
507
508impl ArrayValidityImpl for VarBinViewArray {
509 fn _is_valid(&self, index: usize) -> VortexResult<bool> {
510 self.validity.is_valid(index)
511 }
512
513 fn _all_valid(&self) -> VortexResult<bool> {
514 self.validity.all_valid()
515 }
516
517 fn _all_invalid(&self) -> VortexResult<bool> {
518 self.validity.all_invalid()
519 }
520
521 fn _validity_mask(&self) -> VortexResult<Mask> {
522 self.validity.to_logical(self.len())
523 }
524}
525
526impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
527 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
528 Self::from_iter_nullable_bin(iter)
529 }
530}
531
532impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
533 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
534 Self::from_iter_nullable_bin(iter)
535 }
536}
537
538impl FromIterator<Option<String>> for VarBinViewArray {
539 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
540 Self::from_iter_nullable_str(iter)
541 }
542}
543
544impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
545 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
546 Self::from_iter_nullable_str(iter)
547 }
548}
549
#[cfg(test)]
mod test {
    use vortex_scalar::Scalar;

    use crate::Canonical;
    use crate::array::Array;
    use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
    use crate::compute::{scalar_at, slice};

    /// Short enough (11 bytes) to be stored inline in a view.
    const SHORT: &str = "hello world";
    /// Too long to be inlined, so it is stored in a data buffer.
    const LONG: &str = "hello world this is a long string";

    /// Both an inlined and a buffer-backed value can be read back.
    #[test]
    pub fn varbin_view() {
        let array = VarBinViewArray::from_iter_str([SHORT, LONG]);
        assert_eq!(array.len(), 2);

        let first = scalar_at(&array, 0).unwrap();
        assert_eq!(first, Scalar::from(SHORT));

        let second = scalar_at(&array, 1).unwrap();
        assert_eq!(second, Scalar::from(LONG));
    }

    /// Slicing keeps the buffer-backed value readable at its new index.
    #[test]
    pub fn slice_array() {
        let source = VarBinViewArray::from_iter_str([SHORT, LONG]);
        let sliced = slice(&source, 1, 2).unwrap();

        let only = scalar_at(&sliced, 0).unwrap();
        assert_eq!(only, Scalar::from(LONG));
    }

    /// Canonicalizing yields a `VarBinView` with the same values.
    #[test]
    pub fn flatten_array() {
        let array = VarBinViewArray::from_iter_str(["string1", "string2"]);

        let canonical = array.to_canonical().unwrap();
        assert!(matches!(canonical, Canonical::VarBinView(_)));

        let var_bin = canonical.into_varbinview().unwrap().into_array();
        let expected = ["string1", "string2"];
        for (index, value) in expected.into_iter().enumerate() {
            assert_eq!(scalar_at(&var_bin, index).unwrap(), Scalar::from(value));
        }
    }

    /// A view occupies 16 bytes and is 16-byte aligned.
    #[test]
    pub fn binary_view_size_and_alignment() {
        assert_eq!(std::mem::size_of::<BinaryView>(), 16);
        assert_eq!(std::mem::align_of::<BinaryView>(), 16);
    }
}