1use std::fmt::{Debug, Formatter};
5use std::ops::Range;
6use std::sync::Arc;
7
8use static_assertions::{assert_eq_align, assert_eq_size};
9use vortex_buffer::{Alignment, Buffer, ByteBuffer};
10use vortex_dtype::{DType, Nullability};
11use vortex_error::{VortexResult, VortexUnwrap, vortex_bail, vortex_panic};
12
13use crate::builders::{ArrayBuilder, VarBinViewBuilder};
14use crate::stats::{ArrayStats, StatsSetRef};
15use crate::validity::Validity;
16use crate::vtable::{
17 ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper,
18 ValidityVTableFromValidityHelper,
19};
20use crate::{Canonical, EncodingId, EncodingRef, vtable};
21
22mod accessor;
23mod compact;
24mod compute;
25mod ops;
26mod serde;
27
28#[derive(Clone, Copy, Debug, PartialEq, Eq)]
29#[repr(C, align(8))]
30pub struct Inlined {
31 size: u32,
32 data: [u8; BinaryView::MAX_INLINED_SIZE],
33}
34
35impl Inlined {
36 fn new<const N: usize>(value: &[u8]) -> Self {
37 let mut inlined = Self {
38 size: N.try_into().vortex_unwrap(),
39 data: [0u8; BinaryView::MAX_INLINED_SIZE],
40 };
41 inlined.data[..N].copy_from_slice(&value[..N]);
42 inlined
43 }
44
45 #[inline]
46 pub fn value(&self) -> &[u8] {
47 &self.data[0..(self.size as usize)]
48 }
49}
50
51#[derive(Clone, Copy, Debug)]
52#[repr(C, align(8))]
53pub struct Ref {
54 size: u32,
55 prefix: [u8; 4],
56 buffer_index: u32,
57 offset: u32,
58}
59
60impl Ref {
61 pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
62 Self {
63 size,
64 prefix,
65 buffer_index,
66 offset,
67 }
68 }
69
70 #[inline]
71 pub fn buffer_index(&self) -> u32 {
72 self.buffer_index
73 }
74
75 #[inline]
76 pub fn offset(&self) -> u32 {
77 self.offset
78 }
79
80 #[inline]
81 pub fn prefix(&self) -> &[u8; 4] {
82 &self.prefix
83 }
84
85 #[inline]
86 pub fn to_range(&self) -> Range<usize> {
87 self.offset as usize..(self.offset + self.size) as usize
88 }
89}
90
91#[derive(Clone, Copy)]
92#[repr(C, align(16))]
93pub union BinaryView {
94 le_bytes: [u8; 16],
97
98 inlined: Inlined,
100
101 _ref: Ref,
103}
104
105assert_eq_size!(BinaryView, [u8; 16]);
106assert_eq_size!(Inlined, [u8; 16]);
107assert_eq_size!(Ref, [u8; 16]);
108assert_eq_align!(BinaryView, u128);
109
110impl BinaryView {
111 pub const MAX_INLINED_SIZE: usize = 12;
112
113 #[inline(never)]
121 pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
122 match value.len() {
123 0 => Self {
124 inlined: Inlined::new::<0>(value),
125 },
126 1 => Self {
127 inlined: Inlined::new::<1>(value),
128 },
129 2 => Self {
130 inlined: Inlined::new::<2>(value),
131 },
132 3 => Self {
133 inlined: Inlined::new::<3>(value),
134 },
135 4 => Self {
136 inlined: Inlined::new::<4>(value),
137 },
138 5 => Self {
139 inlined: Inlined::new::<5>(value),
140 },
141 6 => Self {
142 inlined: Inlined::new::<6>(value),
143 },
144 7 => Self {
145 inlined: Inlined::new::<7>(value),
146 },
147 8 => Self {
148 inlined: Inlined::new::<8>(value),
149 },
150 9 => Self {
151 inlined: Inlined::new::<9>(value),
152 },
153 10 => Self {
154 inlined: Inlined::new::<10>(value),
155 },
156 11 => Self {
157 inlined: Inlined::new::<11>(value),
158 },
159 12 => Self {
160 inlined: Inlined::new::<12>(value),
161 },
162 _ => Self {
163 _ref: Ref::new(
164 u32::try_from(value.len()).vortex_unwrap(),
165 value[0..4].try_into().vortex_unwrap(),
166 block,
167 offset,
168 ),
169 },
170 }
171 }
172
173 #[inline]
175 pub fn empty_view() -> Self {
176 Self::new_inlined(&[])
177 }
178
179 #[inline]
181 pub fn new_inlined(value: &[u8]) -> Self {
182 assert!(
183 value.len() <= Self::MAX_INLINED_SIZE,
184 "expected inlined value to be <= 12 bytes, was {}",
185 value.len()
186 );
187
188 Self::make_view(value, 0, 0)
189 }
190
191 #[inline]
192 pub fn len(&self) -> u32 {
193 unsafe { self.inlined.size }
194 }
195
196 #[inline]
197 pub fn is_empty(&self) -> bool {
198 self.len() > 0
199 }
200
201 #[inline]
202 #[allow(clippy::cast_possible_truncation)]
203 pub fn is_inlined(&self) -> bool {
204 self.len() <= (Self::MAX_INLINED_SIZE as u32)
205 }
206
207 pub fn as_inlined(&self) -> &Inlined {
208 unsafe { &self.inlined }
209 }
210
211 pub fn as_view(&self) -> &Ref {
212 unsafe { &self._ref }
213 }
214
215 pub fn as_u128(&self) -> u128 {
216 unsafe { u128::from_le_bytes(self.le_bytes) }
218 }
219
220 #[inline(always)]
222 pub fn with_buffer_idx(self, buffer_idx: u32) -> Self {
223 if self.is_inlined() {
224 self
225 } else {
226 let view_ref = self.as_view();
228 Self {
229 _ref: Ref::new(
230 self.len(),
231 *view_ref.prefix(),
232 buffer_idx,
233 view_ref.offset(),
234 ),
235 }
236 }
237 }
238
239 #[inline(always)]
242 pub fn offset_view(self, offset: u32) -> Self {
243 if self.is_inlined() {
244 self
245 } else {
246 let view_ref = self.as_view();
248 Self {
249 _ref: Ref::new(
250 self.len(),
251 *view_ref.prefix(),
252 offset + view_ref.buffer_index(),
253 view_ref.offset(),
254 ),
255 }
256 }
257 }
258}
259
260impl From<u128> for BinaryView {
261 fn from(value: u128) -> Self {
262 BinaryView {
263 le_bytes: value.to_le_bytes(),
264 }
265 }
266}
267
268impl Debug for BinaryView {
269 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
270 let mut s = f.debug_struct("BinaryView");
271 if self.is_inlined() {
272 s.field("inline", &"i".to_string());
273 } else {
274 s.field("ref", &"r".to_string());
275 }
276 s.finish()
277 }
278}
279
280vtable!(VarBinView);
281
282impl VTable for VarBinViewVTable {
283 type Array = VarBinViewArray;
284 type Encoding = VarBinViewEncoding;
285
286 type ArrayVTable = Self;
287 type CanonicalVTable = Self;
288 type OperationsVTable = Self;
289 type ValidityVTable = ValidityVTableFromValidityHelper;
290 type VisitorVTable = Self;
291 type ComputeVTable = NotSupported;
292 type EncodeVTable = NotSupported;
293 type SerdeVTable = Self;
294
295 fn id(_encoding: &Self::Encoding) -> EncodingId {
296 EncodingId::new_ref("vortex.varbinview")
297 }
298
299 fn encoding(_array: &Self::Array) -> EncodingRef {
300 EncodingRef::new_ref(VarBinViewEncoding.as_ref())
301 }
302}
303
304#[derive(Clone, Debug)]
364pub struct VarBinViewArray {
365 dtype: DType,
366 buffers: Arc<[ByteBuffer]>,
367 views: Buffer<BinaryView>,
368 validity: Validity,
369 stats_set: ArrayStats,
370}
371
372#[derive(Clone, Debug)]
373pub struct VarBinViewEncoding;
374
375impl VarBinViewArray {
376 pub fn try_new(
377 views: Buffer<BinaryView>,
378 buffers: Arc<[ByteBuffer]>,
379 dtype: DType,
380 validity: Validity,
381 ) -> VortexResult<Self> {
382 if views.alignment() != Alignment::of::<BinaryView>() {
383 vortex_bail!("Views must be aligned to a 128 bits");
384 }
385
386 if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
387 vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
388 }
389
390 if dtype.is_nullable() == (validity == Validity::NonNullable) {
391 vortex_bail!("incorrect validity {:?}", validity);
392 }
393
394 Ok(Self {
395 dtype,
396 buffers,
397 views,
398 validity,
399 stats_set: Default::default(),
400 })
401 }
402
403 pub fn nbuffers(&self) -> usize {
405 self.buffers.len()
406 }
407
408 #[inline]
414 pub fn views(&self) -> &Buffer<BinaryView> {
415 &self.views
416 }
417
418 #[inline]
422 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
423 let views = self.views();
424 let view = &views[index];
425 if !view.is_inlined() {
427 let view_ref = view.as_view();
428 self.buffer(view_ref.buffer_index() as usize)
429 .slice(view_ref.to_range())
430 } else {
431 views
433 .clone()
434 .into_byte_buffer()
435 .slice_ref(view.as_inlined().value())
436 }
437 }
438
439 #[inline]
446 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
447 if idx >= self.nbuffers() {
448 vortex_panic!(
449 "{idx} buffer index out of bounds, there are {} buffers",
450 self.nbuffers()
451 );
452 }
453 &self.buffers[idx]
454 }
455
456 #[inline]
458 pub fn buffers(&self) -> &Arc<[ByteBuffer]> {
459 &self.buffers
460 }
461
462 #[allow(clippy::same_name_method)]
464 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
465 iter: I,
466 dtype: DType,
467 ) -> Self {
468 let iter = iter.into_iter();
469 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
470
471 for item in iter {
472 match item {
473 None => builder.append_null(),
474 Some(v) => builder.append_value(v),
475 }
476 }
477
478 builder.finish_into_varbinview()
479 }
480
481 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
482 let iter = iter.into_iter();
483 let mut builder = VarBinViewBuilder::with_capacity(
484 DType::Utf8(Nullability::NonNullable),
485 iter.size_hint().0,
486 );
487
488 for item in iter {
489 builder.append_value(item.as_ref());
490 }
491
492 builder.finish_into_varbinview()
493 }
494
495 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
496 iter: I,
497 ) -> Self {
498 let iter = iter.into_iter();
499 let mut builder = VarBinViewBuilder::with_capacity(
500 DType::Utf8(Nullability::Nullable),
501 iter.size_hint().0,
502 );
503
504 for item in iter {
505 match item {
506 None => builder.append_null(),
507 Some(v) => builder.append_value(v.as_ref()),
508 }
509 }
510
511 builder.finish_into_varbinview()
512 }
513
514 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
515 let iter = iter.into_iter();
516 let mut builder = VarBinViewBuilder::with_capacity(
517 DType::Binary(Nullability::NonNullable),
518 iter.size_hint().0,
519 );
520
521 for item in iter {
522 builder.append_value(item.as_ref());
523 }
524
525 builder.finish_into_varbinview()
526 }
527
528 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
529 iter: I,
530 ) -> Self {
531 let iter = iter.into_iter();
532 let mut builder = VarBinViewBuilder::with_capacity(
533 DType::Binary(Nullability::Nullable),
534 iter.size_hint().0,
535 );
536
537 for item in iter {
538 match item {
539 None => builder.append_null(),
540 Some(v) => builder.append_value(v.as_ref()),
541 }
542 }
543
544 builder.finish_into_varbinview()
545 }
546}
547
548impl ArrayVTable<VarBinViewVTable> for VarBinViewVTable {
549 fn len(array: &VarBinViewArray) -> usize {
550 array.views.len()
551 }
552
553 fn dtype(array: &VarBinViewArray) -> &DType {
554 &array.dtype
555 }
556
557 fn stats(array: &VarBinViewArray) -> StatsSetRef<'_> {
558 array.stats_set.to_ref(array.as_ref())
559 }
560}
561
562impl ValidityHelper for VarBinViewArray {
563 fn validity(&self) -> &Validity {
564 &self.validity
565 }
566}
567
568impl CanonicalVTable<VarBinViewVTable> for VarBinViewVTable {
569 fn canonicalize(array: &VarBinViewArray) -> VortexResult<Canonical> {
570 Ok(Canonical::VarBinView(array.clone()))
571 }
572
573 fn append_to_builder(
574 array: &VarBinViewArray,
575 builder: &mut dyn ArrayBuilder,
576 ) -> VortexResult<()> {
577 builder.extend_from_array(array.as_ref())
578 }
579}
580
581impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
582 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
583 Self::from_iter_nullable_bin(iter)
584 }
585}
586
587impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
588 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
589 Self::from_iter_nullable_bin(iter)
590 }
591}
592
593impl FromIterator<Option<String>> for VarBinViewArray {
594 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
595 Self::from_iter_nullable_str(iter)
596 }
597}
598
599impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
600 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
601 Self::from_iter_nullable_str(iter)
602 }
603}
604
605#[cfg(test)]
606mod test {
607 use vortex_scalar::Scalar;
608
609 use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
610 use crate::{Array, Canonical, IntoArray};
611
612 #[test]
613 pub fn varbin_view() {
614 let binary_arr =
615 VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"]);
616 assert_eq!(binary_arr.len(), 2);
617 assert_eq!(
618 binary_arr.scalar_at(0).unwrap(),
619 Scalar::from("hello world")
620 );
621 assert_eq!(
622 binary_arr.scalar_at(1).unwrap(),
623 Scalar::from("hello world this is a long string")
624 );
625 }
626
627 #[test]
628 pub fn slice_array() {
629 let binary_arr =
630 VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"])
631 .slice(1, 2)
632 .unwrap();
633 assert_eq!(
634 binary_arr.scalar_at(0).unwrap(),
635 Scalar::from("hello world this is a long string")
636 );
637 }
638
639 #[test]
640 pub fn flatten_array() {
641 let binary_arr = VarBinViewArray::from_iter_str(["string1", "string2"]);
642
643 let flattened = binary_arr.to_canonical().unwrap();
644 assert!(matches!(flattened, Canonical::VarBinView(_)));
645
646 let var_bin = flattened.into_varbinview().unwrap().into_array();
647 assert_eq!(var_bin.scalar_at(0).unwrap(), Scalar::from("string1"));
648 assert_eq!(var_bin.scalar_at(1).unwrap(), Scalar::from("string2"));
649 }
650
651 #[test]
652 pub fn binary_view_size_and_alignment() {
653 assert_eq!(size_of::<BinaryView>(), 16);
654 assert_eq!(align_of::<BinaryView>(), 16);
655 }
656}