1use std::fmt::{Debug, Formatter};
2use std::ops::Range;
3
4use arrow_array::GenericByteViewArray;
5use arrow_array::builder::{BinaryViewBuilder, GenericByteViewBuilder, StringViewBuilder};
6use arrow_array::types::{BinaryViewType, ByteViewType, StringViewType};
7use static_assertions::{assert_eq_align, assert_eq_size};
8use vortex_buffer::{Alignment, Buffer, ByteBuffer};
9use vortex_dtype::DType;
10use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_panic};
11
12use crate::arrow::FromArrowArray;
13use crate::builders::ArrayBuilder;
14use crate::stats::{ArrayStats, StatsSetRef};
15use crate::validity::Validity;
16use crate::vtable::{
17 ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper,
18 ValidityVTableFromValidityHelper,
19};
20use crate::{ArrayRef, Canonical, EncodingId, EncodingRef, ToCanonical, vtable};
21
22mod accessor;
23mod compute;
24mod ops;
25mod serde;
26
27#[derive(Clone, Copy, Debug, PartialEq, Eq)]
28#[repr(C, align(8))]
29pub struct Inlined {
30 size: u32,
31 data: [u8; BinaryView::MAX_INLINED_SIZE],
32}
33
34impl Inlined {
35 fn new<const N: usize>(value: &[u8]) -> Self {
36 let mut inlined = Self {
37 size: N.try_into().vortex_unwrap(),
38 data: [0u8; BinaryView::MAX_INLINED_SIZE],
39 };
40 inlined.data[..N].copy_from_slice(&value[..N]);
41 inlined
42 }
43
44 #[inline]
45 pub fn value(&self) -> &[u8] {
46 &self.data[0..(self.size as usize)]
47 }
48}
49
/// View representation for values that are stored out-of-line in a data buffer.
///
/// The view records the value length, a 4-byte prefix, the index of the
/// buffer that holds the bytes, and the byte offset inside that buffer.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    size: u32,
    prefix: [u8; 4],
    buffer_index: u32,
    offset: u32,
}

impl Ref {
    /// Creates a reference view from its four components.
    pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
        Self {
            size,
            prefix,
            buffer_index,
            offset,
        }
    }

    /// Index of the data buffer that holds the referenced bytes.
    #[inline]
    pub fn buffer_index(&self) -> u32 {
        self.buffer_index
    }

    /// Byte offset of the value inside its data buffer.
    #[inline]
    pub fn offset(&self) -> u32 {
        self.offset
    }

    /// The 4-byte prefix stored alongside the reference.
    #[inline]
    pub fn prefix(&self) -> &[u8; 4] {
        &self.prefix
    }

    /// Byte range (`offset..offset + size`) of the value inside its data buffer.
    ///
    /// Both operands are widened to `usize` before the addition so that an
    /// `offset + size` sum larger than `u32::MAX` does not overflow in `u32`.
    #[inline]
    pub fn to_range(&self) -> Range<usize> {
        let start = self.offset as usize;
        let len = self.size as usize;
        start..start + len
    }
}
89
90#[derive(Clone, Copy)]
91#[repr(C, align(16))]
92pub union BinaryView {
93 le_bytes: [u8; 16],
96
97 inlined: Inlined,
99
100 _ref: Ref,
102}
103
104assert_eq_size!(BinaryView, [u8; 16]);
105assert_eq_size!(Inlined, [u8; 16]);
106assert_eq_size!(Ref, [u8; 16]);
107assert_eq_align!(BinaryView, u128);
108
109impl BinaryView {
110 pub const MAX_INLINED_SIZE: usize = 12;
111
112 #[inline(never)]
120 pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
121 match value.len() {
122 0 => Self {
123 inlined: Inlined::new::<0>(value),
124 },
125 1 => Self {
126 inlined: Inlined::new::<1>(value),
127 },
128 2 => Self {
129 inlined: Inlined::new::<2>(value),
130 },
131 3 => Self {
132 inlined: Inlined::new::<3>(value),
133 },
134 4 => Self {
135 inlined: Inlined::new::<4>(value),
136 },
137 5 => Self {
138 inlined: Inlined::new::<5>(value),
139 },
140 6 => Self {
141 inlined: Inlined::new::<6>(value),
142 },
143 7 => Self {
144 inlined: Inlined::new::<7>(value),
145 },
146 8 => Self {
147 inlined: Inlined::new::<8>(value),
148 },
149 9 => Self {
150 inlined: Inlined::new::<9>(value),
151 },
152 10 => Self {
153 inlined: Inlined::new::<10>(value),
154 },
155 11 => Self {
156 inlined: Inlined::new::<11>(value),
157 },
158 12 => Self {
159 inlined: Inlined::new::<12>(value),
160 },
161 _ => Self {
162 _ref: Ref::new(
163 u32::try_from(value.len()).vortex_unwrap(),
164 value[0..4].try_into().vortex_unwrap(),
165 block,
166 offset,
167 ),
168 },
169 }
170 }
171
172 #[inline]
174 pub fn empty_view() -> Self {
175 Self::new_inlined(&[])
176 }
177
178 #[inline]
180 pub fn new_inlined(value: &[u8]) -> Self {
181 assert!(
182 value.len() <= Self::MAX_INLINED_SIZE,
183 "expected inlined value to be <= 12 bytes, was {}",
184 value.len()
185 );
186
187 Self::make_view(value, 0, 0)
188 }
189
190 #[inline]
191 pub fn len(&self) -> u32 {
192 unsafe { self.inlined.size }
193 }
194
195 #[inline]
196 pub fn is_empty(&self) -> bool {
197 self.len() > 0
198 }
199
200 #[inline]
201 #[allow(clippy::cast_possible_truncation)]
202 pub fn is_inlined(&self) -> bool {
203 self.len() <= (Self::MAX_INLINED_SIZE as u32)
204 }
205
206 pub fn as_inlined(&self) -> &Inlined {
207 unsafe { &self.inlined }
208 }
209
210 pub fn as_view(&self) -> &Ref {
211 unsafe { &self._ref }
212 }
213
214 pub fn as_u128(&self) -> u128 {
215 unsafe { u128::from_le_bytes(self.le_bytes) }
217 }
218
219 #[inline(always)]
222 pub fn offset_view(self, offset: u32) -> Self {
223 if self.is_inlined() {
224 self
225 } else {
226 let view_ref = self.as_view();
228 Self {
229 _ref: Ref::new(
230 self.len(),
231 *view_ref.prefix(),
232 offset + view_ref.buffer_index(),
233 view_ref.offset(),
234 ),
235 }
236 }
237 }
238}
239
240impl From<u128> for BinaryView {
241 fn from(value: u128) -> Self {
242 BinaryView {
243 le_bytes: value.to_le_bytes(),
244 }
245 }
246}
247
248impl Debug for BinaryView {
249 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
250 let mut s = f.debug_struct("BinaryView");
251 if self.is_inlined() {
252 s.field("inline", &"i".to_string());
253 } else {
254 s.field("ref", &"r".to_string());
255 }
256 s.finish()
257 }
258}
259
// NOTE(review): `vtable!` is a crate macro; presumably it declares
// `VarBinViewVTable` and the glue tying it to `VarBinViewArray` and
// `VarBinViewEncoding` - confirm against the macro definition.
vtable!(VarBinView);

/// Wires the VarBinView encoding into the vtable machinery.
///
/// Most sub-vtables are implemented on `VarBinViewVTable` itself; validity is
/// served through `ValidityHelper`, and the compute and encode vtables are
/// marked `NotSupported`.
impl VTable for VarBinViewVTable {
    type Array = VarBinViewArray;
    type Encoding = VarBinViewEncoding;

    type ArrayVTable = Self;
    type CanonicalVTable = Self;
    type OperationsVTable = Self;
    type ValidityVTable = ValidityVTableFromValidityHelper;
    type VisitorVTable = Self;
    type ComputeVTable = NotSupported;
    type EncodeVTable = NotSupported;
    type SerdeVTable = Self;

    /// Identifier of this encoding: `"vortex.varbinview"`.
    fn id(_encoding: &Self::Encoding) -> EncodingId {
        EncodingId::new_ref("vortex.varbinview")
    }

    /// Returns a reference to the (stateless) encoding; the array is not consulted.
    fn encoding(_array: &Self::Array) -> EncodingRef {
        EncodingRef::new_ref(VarBinViewEncoding.as_ref())
    }
}
283
284#[derive(Clone, Debug)]
285pub struct VarBinViewArray {
286 dtype: DType,
287 buffers: Vec<ByteBuffer>,
288 views: Buffer<BinaryView>,
289 validity: Validity,
290 stats_set: ArrayStats,
291}
292
293#[derive(Clone, Debug)]
294pub struct VarBinViewEncoding;
295
296impl VarBinViewArray {
297 pub fn try_new(
298 views: Buffer<BinaryView>,
299 buffers: Vec<ByteBuffer>,
300 dtype: DType,
301 validity: Validity,
302 ) -> VortexResult<Self> {
303 if views.alignment() != Alignment::of::<BinaryView>() {
304 vortex_bail!("Views must be aligned to a 128 bits");
305 }
306
307 if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
308 vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
309 }
310
311 if dtype.is_nullable() == (validity == Validity::NonNullable) {
312 vortex_bail!("incorrect validity {:?}", validity);
313 }
314
315 Ok(Self {
316 dtype,
317 buffers,
318 views,
319 validity,
320 stats_set: Default::default(),
321 })
322 }
323
324 pub fn nbuffers(&self) -> usize {
326 self.buffers.len()
327 }
328
329 #[inline]
335 pub fn views(&self) -> &Buffer<BinaryView> {
336 &self.views
337 }
338
339 #[inline]
343 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
344 let views = self.views();
345 let view = &views[index];
346 if !view.is_inlined() {
348 let view_ref = view.as_view();
349 self.buffer(view_ref.buffer_index() as usize)
350 .slice(view_ref.to_range())
351 } else {
352 views
354 .clone()
355 .into_byte_buffer()
356 .slice_ref(view.as_inlined().value())
357 }
358 }
359
360 #[inline]
367 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
368 if idx >= self.nbuffers() {
369 vortex_panic!(
370 "{idx} buffer index out of bounds, there are {} buffers",
371 self.nbuffers()
372 );
373 }
374 &self.buffers[idx]
375 }
376
377 #[inline]
379 pub fn buffers(&self) -> &[ByteBuffer] {
380 &self.buffers
381 }
382
383 #[allow(clippy::same_name_method)]
385 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
386 iter: I,
387 dtype: DType,
388 ) -> Self {
389 match dtype {
390 DType::Utf8(nullability) => {
391 let string_view_array = generic_byte_view_builder::<StringViewType, _, _>(
392 iter.into_iter(),
393 |builder, v| {
394 match v {
395 None => builder.append_null(),
396 Some(inner) => {
397 let utf8 = unsafe { std::str::from_utf8_unchecked(inner.as_ref()) };
399 builder.append_value(utf8);
400 }
401 }
402 },
403 );
404 ArrayRef::from_arrow(&string_view_array, nullability.into())
405 .to_varbinview()
406 .vortex_expect("StringViewArray to VarBinViewArray downcast")
407 }
408 DType::Binary(nullability) => {
409 let binary_view_array = generic_byte_view_builder::<BinaryViewType, _, _>(
410 iter.into_iter(),
411 GenericByteViewBuilder::append_option,
412 );
413 ArrayRef::from_arrow(&binary_view_array, nullability.into())
414 .to_varbinview()
415 .vortex_expect("BinaryViewArray to VarBinViewArray downcast")
416 }
417 other => vortex_panic!("VarBinViewArray must be Utf8 or Binary, was {other}"),
418 }
419 }
420
421 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
422 let iter = iter.into_iter();
423 let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0);
424 for s in iter {
425 builder.append_value(s);
426 }
427 ArrayRef::from_arrow(&builder.finish(), false)
428 .to_varbinview()
429 .vortex_expect("VarBinViewArray from StringViewBuilder")
430 }
431
432 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
433 iter: I,
434 ) -> Self {
435 let iter = iter.into_iter();
436 let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0);
437 builder.extend(iter);
438
439 let array = ArrayRef::from_arrow(&builder.finish(), true);
440 array
441 .to_varbinview()
442 .vortex_expect("VarBinViewArray from StringViewBuilder")
443 }
444
445 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
446 let iter = iter.into_iter();
447 let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0);
448 for b in iter {
449 builder.append_value(b);
450 }
451 ArrayRef::from_arrow(&builder.finish(), false)
452 .to_varbinview()
453 .vortex_expect("VarBinViewArray from StringViewBuilder")
454 }
455
456 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
457 iter: I,
458 ) -> Self {
459 let iter = iter.into_iter();
460 let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0);
461 builder.extend(iter);
462 ArrayRef::from_arrow(&builder.finish(), true)
463 .to_varbinview()
464 .vortex_expect("VarBinViewArray from StringViewBuilder")
465 }
466}
467
468fn generic_byte_view_builder<B, V, F>(
470 values: impl Iterator<Item = Option<V>>,
471 mut append_fn: F,
472) -> GenericByteViewArray<B>
473where
474 B: ByteViewType,
475 V: AsRef<[u8]>,
476 F: FnMut(&mut GenericByteViewBuilder<B>, Option<V>),
477{
478 let mut builder = GenericByteViewBuilder::<B>::new();
479
480 for value in values {
481 append_fn(&mut builder, value);
482 }
483
484 builder.finish()
485}
486
/// Basic array metadata for `VarBinViewArray`.
impl ArrayVTable<VarBinViewVTable> for VarBinViewVTable {
    /// Number of elements, i.e. the number of views.
    fn len(array: &VarBinViewArray) -> usize {
        array.views.len()
    }

    /// The array's logical type.
    fn dtype(array: &VarBinViewArray) -> &DType {
        &array.dtype
    }

    /// Statistics handle built from the array's stats set and the array itself.
    fn stats(array: &VarBinViewArray) -> StatsSetRef<'_> {
        array.stats_set.to_ref(array.as_ref())
    }
}
500
/// Exposes the stored validity; `ValidityVTableFromValidityHelper` is the
/// validity vtable selected for this encoding.
impl ValidityHelper for VarBinViewArray {
    /// Returns the array's validity.
    fn validity(&self) -> &Validity {
        &self.validity
    }
}
506
/// Canonicalization for `VarBinViewArray`.
impl CanonicalVTable<VarBinViewVTable> for VarBinViewVTable {
    /// Wraps a clone of the array in `Canonical::VarBinView`; no re-encoding
    /// is performed here.
    fn canonicalize(array: &VarBinViewArray) -> VortexResult<Canonical> {
        Ok(Canonical::VarBinView(array.clone()))
    }

    /// Appends the array to `builder` by delegating to
    /// `ArrayBuilder::extend_from_array`.
    fn append_to_builder(
        array: &VarBinViewArray,
        builder: &mut dyn ArrayBuilder,
    ) -> VortexResult<()> {
        builder.extend_from_array(array.as_ref())
    }
}
519
520impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
521 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
522 Self::from_iter_nullable_bin(iter)
523 }
524}
525
526impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
527 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
528 Self::from_iter_nullable_bin(iter)
529 }
530}
531
532impl FromIterator<Option<String>> for VarBinViewArray {
533 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
534 Self::from_iter_nullable_str(iter)
535 }
536}
537
538impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
539 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
540 Self::from_iter_nullable_str(iter)
541 }
542}
543
#[cfg(test)]
mod test {
    use vortex_scalar::Scalar;

    use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
    use crate::{Array, Canonical, IntoArray};

    // 11 bytes: short enough to be inlined in the view.
    const SHORT: &str = "hello world";
    // 33 bytes: too long to inline, so it is stored in a data buffer.
    const LONG: &str = "hello world this is a long string";

    /// Values round-trip through the array for both inlined and referenced views.
    #[test]
    pub fn varbin_view() {
        let array = VarBinViewArray::from_iter_str([SHORT, LONG]);
        assert_eq!(array.len(), 2);

        for (index, expected) in [SHORT, LONG].into_iter().enumerate() {
            assert_eq!(array.scalar_at(index).unwrap(), Scalar::from(expected));
        }
    }

    /// Slicing off the first element leaves the long value at index 0.
    #[test]
    pub fn slice_array() {
        let source = VarBinViewArray::from_iter_str([SHORT, LONG]);
        let sliced = source.slice(1, 2).unwrap();

        assert_eq!(sliced.scalar_at(0).unwrap(), Scalar::from(LONG));
    }

    /// Canonicalizing yields a `VarBinView` canonical form with the same values.
    #[test]
    pub fn flatten_array() {
        let expected = ["string1", "string2"];
        let source = VarBinViewArray::from_iter_str(expected);

        let canonical = source.to_canonical().unwrap();
        assert!(matches!(canonical, Canonical::VarBinView(_)));

        let flattened = canonical.into_varbinview().unwrap().into_array();
        for (index, value) in expected.into_iter().enumerate() {
            assert_eq!(flattened.scalar_at(index).unwrap(), Scalar::from(value));
        }
    }

    /// The view must stay 16 bytes wide and 16-byte aligned.
    #[test]
    pub fn binary_view_size_and_alignment() {
        let size = size_of::<BinaryView>();
        let alignment = align_of::<BinaryView>();

        assert_eq!(size, 16);
        assert_eq!(alignment, 16);
    }
}