vortex_array/arrays/varbinview/
mod.rs

1use std::fmt::{Debug, Formatter};
2use std::ops::Range;
3
4use arrow_array::GenericByteViewArray;
5use arrow_array::builder::{BinaryViewBuilder, GenericByteViewBuilder, StringViewBuilder};
6use arrow_array::types::{BinaryViewType, ByteViewType, StringViewType};
7use static_assertions::{assert_eq_align, assert_eq_size};
8use vortex_buffer::{Alignment, Buffer, ByteBuffer};
9use vortex_dtype::DType;
10use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_panic};
11
12use crate::arrow::FromArrowArray;
13use crate::builders::ArrayBuilder;
14use crate::stats::{ArrayStats, StatsSetRef};
15use crate::validity::Validity;
16use crate::vtable::{
17    ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper,
18    ValidityVTableFromValidityHelper,
19};
20use crate::{ArrayRef, Canonical, EncodingId, EncodingRef, ToCanonical, vtable};
21
22mod accessor;
23mod compute;
24mod ops;
25mod serde;
26
27#[derive(Clone, Copy, Debug, PartialEq, Eq)]
28#[repr(C, align(8))]
29pub struct Inlined {
30    size: u32,
31    data: [u8; BinaryView::MAX_INLINED_SIZE],
32}
33
34impl Inlined {
35    fn new<const N: usize>(value: &[u8]) -> Self {
36        let mut inlined = Self {
37            size: N.try_into().vortex_unwrap(),
38            data: [0u8; BinaryView::MAX_INLINED_SIZE],
39        };
40        inlined.data[..N].copy_from_slice(&value[..N]);
41        inlined
42    }
43
44    #[inline]
45    pub fn value(&self) -> &[u8] {
46        &self.data[0..(self.size as usize)]
47    }
48}
49
/// Reference form of a binary view: locates a value stored in one of the array's data buffers.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    /// Length of the referenced value in bytes.
    size: u32,
    /// Four-byte prefix of the value (`BinaryView::make_view` stores the value's first four
    /// bytes here).
    prefix: [u8; 4],
    /// Index of the data buffer that holds the value.
    buffer_index: u32,
    /// Byte offset of the value inside that data buffer.
    offset: u32,
}

impl Ref {
    /// Assembles a reference view from its four components.
    pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
        Self {
            size,
            prefix,
            buffer_index,
            offset,
        }
    }

    /// Index of the data buffer that holds the value.
    #[inline]
    pub fn buffer_index(&self) -> u32 {
        self.buffer_index
    }

    /// Byte offset of the value inside its data buffer.
    #[inline]
    pub fn offset(&self) -> u32 {
        self.offset
    }

    /// The stored four-byte prefix of the value.
    #[inline]
    pub fn prefix(&self) -> &[u8; 4] {
        &self.prefix
    }

    /// Byte range `offset..offset + size` occupied by the value inside its data buffer.
    #[inline]
    pub fn to_range(&self) -> Range<usize> {
        // Widen to `usize` *before* adding: `offset + size` computed in `u32` overflows (panic in
        // debug builds, silent wrap-around in release builds) whenever the value ends past
        // `u32::MAX`, even though both fields are individually valid.
        let start = self.offset as usize;
        start..start + self.size as usize
    }
}
89
90#[derive(Clone, Copy)]
91#[repr(C, align(16))]
92pub union BinaryView {
93    // Numeric representation. This is logically `u128`, but we split it into the high and low
94    // bits to preserve the alignment.
95    le_bytes: [u8; 16],
96
97    // Inlined representation: strings <= 12 bytes
98    inlined: Inlined,
99
100    // Reference type: strings > 12 bytes.
101    _ref: Ref,
102}
103
104assert_eq_size!(BinaryView, [u8; 16]);
105assert_eq_size!(Inlined, [u8; 16]);
106assert_eq_size!(Ref, [u8; 16]);
107assert_eq_align!(BinaryView, u128);
108
109impl BinaryView {
110    pub const MAX_INLINED_SIZE: usize = 12;
111
112    /// Create a view from a value, block and offset
113    ///
114    /// Depending on the length of the provided value either a new inlined
115    /// or a reference view will be constructed.
116    ///
117    /// Adapted from arrow-rs <https://github.com/apache/arrow-rs/blob/f4fde769ab6e1a9b75f890b7f8b47bc22800830b/arrow-array/src/builder/generic_bytes_view_builder.rs#L524>
118    /// Explicitly enumerating inlined view produces code that avoids calling generic `ptr::copy_non_interleave` that's slower than explicit stores
119    #[inline(never)]
120    pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
121        match value.len() {
122            0 => Self {
123                inlined: Inlined::new::<0>(value),
124            },
125            1 => Self {
126                inlined: Inlined::new::<1>(value),
127            },
128            2 => Self {
129                inlined: Inlined::new::<2>(value),
130            },
131            3 => Self {
132                inlined: Inlined::new::<3>(value),
133            },
134            4 => Self {
135                inlined: Inlined::new::<4>(value),
136            },
137            5 => Self {
138                inlined: Inlined::new::<5>(value),
139            },
140            6 => Self {
141                inlined: Inlined::new::<6>(value),
142            },
143            7 => Self {
144                inlined: Inlined::new::<7>(value),
145            },
146            8 => Self {
147                inlined: Inlined::new::<8>(value),
148            },
149            9 => Self {
150                inlined: Inlined::new::<9>(value),
151            },
152            10 => Self {
153                inlined: Inlined::new::<10>(value),
154            },
155            11 => Self {
156                inlined: Inlined::new::<11>(value),
157            },
158            12 => Self {
159                inlined: Inlined::new::<12>(value),
160            },
161            _ => Self {
162                _ref: Ref::new(
163                    u32::try_from(value.len()).vortex_unwrap(),
164                    value[0..4].try_into().vortex_unwrap(),
165                    block,
166                    offset,
167                ),
168            },
169        }
170    }
171
172    /// Create a new empty view
173    #[inline]
174    pub fn empty_view() -> Self {
175        Self::new_inlined(&[])
176    }
177
178    /// Create a new inlined binary view
179    #[inline]
180    pub fn new_inlined(value: &[u8]) -> Self {
181        assert!(
182            value.len() <= Self::MAX_INLINED_SIZE,
183            "expected inlined value to be <= 12 bytes, was {}",
184            value.len()
185        );
186
187        Self::make_view(value, 0, 0)
188    }
189
190    #[inline]
191    pub fn len(&self) -> u32 {
192        unsafe { self.inlined.size }
193    }
194
195    #[inline]
196    pub fn is_empty(&self) -> bool {
197        self.len() > 0
198    }
199
200    #[inline]
201    #[allow(clippy::cast_possible_truncation)]
202    pub fn is_inlined(&self) -> bool {
203        self.len() <= (Self::MAX_INLINED_SIZE as u32)
204    }
205
206    pub fn as_inlined(&self) -> &Inlined {
207        unsafe { &self.inlined }
208    }
209
210    pub fn as_view(&self) -> &Ref {
211        unsafe { &self._ref }
212    }
213
214    pub fn as_u128(&self) -> u128 {
215        // SAFETY: binary view always safe to read as u128 LE bytes
216        unsafe { u128::from_le_bytes(self.le_bytes) }
217    }
218
219    /// Shifts the buffer reference by the view by a given offset, useful when merging many
220    /// varbinview arrays into one.
221    #[inline(always)]
222    pub fn offset_view(self, offset: u32) -> Self {
223        if self.is_inlined() {
224            self
225        } else {
226            // Referencing views must have their buffer_index adjusted with new offsets
227            let view_ref = self.as_view();
228            Self {
229                _ref: Ref::new(
230                    self.len(),
231                    *view_ref.prefix(),
232                    offset + view_ref.buffer_index(),
233                    view_ref.offset(),
234                ),
235            }
236        }
237    }
238}
239
240impl From<u128> for BinaryView {
241    fn from(value: u128) -> Self {
242        BinaryView {
243            le_bytes: value.to_le_bytes(),
244        }
245    }
246}
247
248impl Debug for BinaryView {
249    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
250        let mut s = f.debug_struct("BinaryView");
251        if self.is_inlined() {
252            s.field("inline", &"i".to_string());
253        } else {
254            s.field("ref", &"r".to_string());
255        }
256        s.finish()
257    }
258}
259
260vtable!(VarBinView);
261
262impl VTable for VarBinViewVTable {
263    type Array = VarBinViewArray;
264    type Encoding = VarBinViewEncoding;
265
266    type ArrayVTable = Self;
267    type CanonicalVTable = Self;
268    type OperationsVTable = Self;
269    type ValidityVTable = ValidityVTableFromValidityHelper;
270    type VisitorVTable = Self;
271    type ComputeVTable = NotSupported;
272    type EncodeVTable = NotSupported;
273    type SerdeVTable = Self;
274
275    fn id(_encoding: &Self::Encoding) -> EncodingId {
276        EncodingId::new_ref("vortex.varbinview")
277    }
278
279    fn encoding(_array: &Self::Array) -> EncodingRef {
280        EncodingRef::new_ref(VarBinViewEncoding.as_ref())
281    }
282}
283
284#[derive(Clone, Debug)]
285pub struct VarBinViewArray {
286    dtype: DType,
287    buffers: Vec<ByteBuffer>,
288    views: Buffer<BinaryView>,
289    validity: Validity,
290    stats_set: ArrayStats,
291}
292
/// Unit type standing for the VarBinView encoding.
#[derive(Debug, Clone)]
pub struct VarBinViewEncoding;
295
296impl VarBinViewArray {
297    pub fn try_new(
298        views: Buffer<BinaryView>,
299        buffers: Vec<ByteBuffer>,
300        dtype: DType,
301        validity: Validity,
302    ) -> VortexResult<Self> {
303        if views.alignment() != Alignment::of::<BinaryView>() {
304            vortex_bail!("Views must be aligned to a 128 bits");
305        }
306
307        if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
308            vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
309        }
310
311        if dtype.is_nullable() == (validity == Validity::NonNullable) {
312            vortex_bail!("incorrect validity {:?}", validity);
313        }
314
315        Ok(Self {
316            dtype,
317            buffers,
318            views,
319            validity,
320            stats_set: Default::default(),
321        })
322    }
323
324    /// Number of raw string data buffers held by this array.
325    pub fn nbuffers(&self) -> usize {
326        self.buffers.len()
327    }
328
329    /// Access to the primitive views buffer.
330    ///
331    /// Variable-sized binary view buffer contain a "view" child array, with 16-byte entries that
332    /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of
333    /// the string (if the string has 12 bytes or fewer).
334    #[inline]
335    pub fn views(&self) -> &Buffer<BinaryView> {
336        &self.views
337    }
338
339    /// Access value bytes at a given index
340    ///
341    /// Will return a bytebuffer pointing to the underlying data without performing a copy
342    #[inline]
343    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
344        let views = self.views();
345        let view = &views[index];
346        // Expect this to be the common case: strings > 12 bytes.
347        if !view.is_inlined() {
348            let view_ref = view.as_view();
349            self.buffer(view_ref.buffer_index() as usize)
350                .slice(view_ref.to_range())
351        } else {
352            // Return access to the range of bytes around it.
353            views
354                .clone()
355                .into_byte_buffer()
356                .slice_ref(view.as_inlined().value())
357        }
358    }
359
360    /// Access one of the backing data buffers.
361    ///
362    /// # Panics
363    ///
364    /// This method panics if the provided index is out of bounds for the set of buffers provided
365    /// at construction time.
366    #[inline]
367    pub fn buffer(&self, idx: usize) -> &ByteBuffer {
368        if idx >= self.nbuffers() {
369            vortex_panic!(
370                "{idx} buffer index out of bounds, there are {} buffers",
371                self.nbuffers()
372            );
373        }
374        &self.buffers[idx]
375    }
376
377    /// Iterate over the underlying raw data buffers, not including the views buffer.
378    #[inline]
379    pub fn buffers(&self) -> &[ByteBuffer] {
380        &self.buffers
381    }
382
383    /// Accumulate an iterable set of values into our type here.
384    #[allow(clippy::same_name_method)]
385    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
386        iter: I,
387        dtype: DType,
388    ) -> Self {
389        match dtype {
390            DType::Utf8(nullability) => {
391                let string_view_array = generic_byte_view_builder::<StringViewType, _, _>(
392                    iter.into_iter(),
393                    |builder, v| {
394                        match v {
395                            None => builder.append_null(),
396                            Some(inner) => {
397                                // SAFETY: the caller must provide valid utf8 values if Utf8 DType is passed.
398                                let utf8 = unsafe { std::str::from_utf8_unchecked(inner.as_ref()) };
399                                builder.append_value(utf8);
400                            }
401                        }
402                    },
403                );
404                ArrayRef::from_arrow(&string_view_array, nullability.into())
405                    .to_varbinview()
406                    .vortex_expect("StringViewArray to VarBinViewArray downcast")
407            }
408            DType::Binary(nullability) => {
409                let binary_view_array = generic_byte_view_builder::<BinaryViewType, _, _>(
410                    iter.into_iter(),
411                    GenericByteViewBuilder::append_option,
412                );
413                ArrayRef::from_arrow(&binary_view_array, nullability.into())
414                    .to_varbinview()
415                    .vortex_expect("BinaryViewArray to VarBinViewArray downcast")
416            }
417            other => vortex_panic!("VarBinViewArray must be Utf8 or Binary, was {other}"),
418        }
419    }
420
421    pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
422        let iter = iter.into_iter();
423        let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0);
424        for s in iter {
425            builder.append_value(s);
426        }
427        ArrayRef::from_arrow(&builder.finish(), false)
428            .to_varbinview()
429            .vortex_expect("VarBinViewArray from StringViewBuilder")
430    }
431
432    pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
433        iter: I,
434    ) -> Self {
435        let iter = iter.into_iter();
436        let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0);
437        builder.extend(iter);
438
439        let array = ArrayRef::from_arrow(&builder.finish(), true);
440        array
441            .to_varbinview()
442            .vortex_expect("VarBinViewArray from StringViewBuilder")
443    }
444
445    pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
446        let iter = iter.into_iter();
447        let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0);
448        for b in iter {
449            builder.append_value(b);
450        }
451        ArrayRef::from_arrow(&builder.finish(), false)
452            .to_varbinview()
453            .vortex_expect("VarBinViewArray from StringViewBuilder")
454    }
455
456    pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
457        iter: I,
458    ) -> Self {
459        let iter = iter.into_iter();
460        let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0);
461        builder.extend(iter);
462        ArrayRef::from_arrow(&builder.finish(), true)
463            .to_varbinview()
464            .vortex_expect("VarBinViewArray from StringViewBuilder")
465    }
466}
467
468/// Generic helper to create an Arrow ByteViewBuilder of the appropriate type.
469fn generic_byte_view_builder<B, V, F>(
470    values: impl Iterator<Item = Option<V>>,
471    mut append_fn: F,
472) -> GenericByteViewArray<B>
473where
474    B: ByteViewType,
475    V: AsRef<[u8]>,
476    F: FnMut(&mut GenericByteViewBuilder<B>, Option<V>),
477{
478    let mut builder = GenericByteViewBuilder::<B>::new();
479
480    for value in values {
481        append_fn(&mut builder, value);
482    }
483
484    builder.finish()
485}
486
487impl ArrayVTable<VarBinViewVTable> for VarBinViewVTable {
488    fn len(array: &VarBinViewArray) -> usize {
489        array.views.len()
490    }
491
492    fn dtype(array: &VarBinViewArray) -> &DType {
493        &array.dtype
494    }
495
496    fn stats(array: &VarBinViewArray) -> StatsSetRef<'_> {
497        array.stats_set.to_ref(array.as_ref())
498    }
499}
500
501impl ValidityHelper for VarBinViewArray {
502    fn validity(&self) -> &Validity {
503        &self.validity
504    }
505}
506
507impl CanonicalVTable<VarBinViewVTable> for VarBinViewVTable {
508    fn canonicalize(array: &VarBinViewArray) -> VortexResult<Canonical> {
509        Ok(Canonical::VarBinView(array.clone()))
510    }
511
512    fn append_to_builder(
513        array: &VarBinViewArray,
514        builder: &mut dyn ArrayBuilder,
515    ) -> VortexResult<()> {
516        builder.extend_from_array(array.as_ref())
517    }
518}
519
520impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
521    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
522        Self::from_iter_nullable_bin(iter)
523    }
524}
525
526impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
527    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
528        Self::from_iter_nullable_bin(iter)
529    }
530}
531
532impl FromIterator<Option<String>> for VarBinViewArray {
533    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
534        Self::from_iter_nullable_str(iter)
535    }
536}
537
538impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
539    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
540        Self::from_iter_nullable_str(iter)
541    }
542}
543
#[cfg(test)]
mod test {
    use vortex_scalar::Scalar;

    use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
    use crate::{Array, Canonical, IntoArray};

    /// Short enough (11 bytes) to be stored inline in the view.
    const SHORT: &str = "hello world";
    /// Longer than 12 bytes, so it is stored in a data buffer.
    const LONG: &str = "hello world this is a long string";

    /// Both an inlined and a referenced value round-trip through `scalar_at`.
    #[test]
    pub fn varbin_view() {
        let array = VarBinViewArray::from_iter_str([SHORT, LONG]);
        assert_eq!(array.len(), 2);
        for (idx, expected) in [SHORT, LONG].into_iter().enumerate() {
            assert_eq!(array.scalar_at(idx).unwrap(), Scalar::from(expected));
        }
    }

    /// Slicing keeps the remaining element readable at its new index.
    #[test]
    pub fn slice_array() {
        let full = VarBinViewArray::from_iter_str([SHORT, LONG]);
        let sliced = full.slice(1, 2).unwrap();
        assert_eq!(sliced.scalar_at(0).unwrap(), Scalar::from(LONG));
    }

    /// Canonicalizing a VarBinView array yields the VarBinView canonical form with equal values.
    #[test]
    pub fn flatten_array() {
        let inputs = ["string1", "string2"];
        let array = VarBinViewArray::from_iter_str(inputs);

        let canonical = array.to_canonical().unwrap();
        assert!(matches!(canonical, Canonical::VarBinView(_)));

        let round_tripped = canonical.into_varbinview().unwrap().into_array();
        for (idx, expected) in inputs.into_iter().enumerate() {
            assert_eq!(round_tripped.scalar_at(idx).unwrap(), Scalar::from(expected));
        }
    }

    /// The view must stay exactly 16 bytes wide and 16-byte aligned.
    #[test]
    pub fn binary_view_size_and_alignment() {
        assert_eq!(std::mem::size_of::<BinaryView>(), 16);
        assert_eq!(std::mem::align_of::<BinaryView>(), 16);
    }
}