vortex_array/arrays/varbinview/
mod.rs

1use std::fmt::{Debug, Formatter};
2use std::ops::Range;
3use std::sync::Arc;
4
5use arrow_array::builder::{BinaryViewBuilder, GenericByteViewBuilder, StringViewBuilder};
6use arrow_array::types::{BinaryViewType, ByteViewType, StringViewType};
7use arrow_array::{
8    ArrayRef as ArrowArrayRef, BinaryViewArray, GenericByteViewArray, StringViewArray,
9};
10use arrow_buffer::ScalarBuffer;
11use static_assertions::{assert_eq_align, assert_eq_size};
12use vortex_buffer::{Alignment, Buffer, ByteBuffer};
13use vortex_dtype::DType;
14use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_panic};
15use vortex_mask::Mask;
16
17use crate::array::{ArrayCanonicalImpl, ArrayValidityImpl};
18use crate::arrow::FromArrowArray;
19use crate::builders::ArrayBuilder;
20use crate::stats::{ArrayStats, StatsSetRef};
21use crate::validity::Validity;
22use crate::vtable::{EncodingVTable, VTableRef};
23use crate::{
24    Array, ArrayImpl, ArrayRef, ArrayStatisticsImpl, Canonical, EmptyMetadata, Encoding,
25    EncodingId, TryFromArrayRef, try_from_array_ref,
26};
27
28mod accessor;
29mod compute;
30mod serde;
31mod stats;
32mod variants;
33
34#[derive(Clone, Copy, Debug, PartialEq, Eq)]
35#[repr(C, align(8))]
36pub struct Inlined {
37    size: u32,
38    data: [u8; BinaryView::MAX_INLINED_SIZE],
39}
40
41impl Inlined {
42    pub fn new(value: &[u8]) -> Self {
43        assert!(
44            value.len() <= BinaryView::MAX_INLINED_SIZE,
45            "Inlined strings must be shorter than 13 characters, {} given",
46            value.len()
47        );
48        let mut inlined = Self {
49            size: value.len().try_into().vortex_unwrap(),
50            data: [0u8; BinaryView::MAX_INLINED_SIZE],
51        };
52        inlined.data[..value.len()].copy_from_slice(value);
53        inlined
54    }
55
56    #[inline]
57    pub fn value(&self) -> &[u8] {
58        &self.data[0..(self.size as usize)]
59    }
60}
61
/// Reference form of a view: describes a byte range stored in one of the array's data buffers.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    /// Length of the referenced value in bytes.
    size: u32,
    /// Four bytes of prefix carried alongside the reference.
    prefix: [u8; 4],
    /// Index of the data buffer that holds the value.
    buffer_index: u32,
    /// Byte offset of the value inside that buffer.
    offset: u32,
}

impl Ref {
    /// Assembles a reference view from its four components.
    pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
        Self {
            size,
            prefix,
            buffer_index,
            offset,
        }
    }

    /// Index of the data buffer holding the referenced bytes.
    #[inline]
    pub fn buffer_index(&self) -> u32 {
        self.buffer_index
    }

    /// Byte offset of the referenced value within its data buffer.
    #[inline]
    pub fn offset(&self) -> u32 {
        self.offset
    }

    /// The four prefix bytes stored in the view.
    #[inline]
    pub fn prefix(&self) -> &[u8; 4] {
        &self.prefix
    }

    /// Byte range (`offset..offset + size`) of the value within its data buffer.
    #[inline]
    pub fn to_range(&self) -> Range<usize> {
        // Widen to `usize` before adding: `offset + size` evaluated in `u32` overflows
        // (panic in debug builds, silent wrap-around in release) once the end of the
        // range exceeds `u32::MAX`.
        let start = self.offset as usize;
        start..start + self.size as usize
    }
}
101
102#[derive(Clone, Copy)]
103#[repr(C, align(16))]
104pub union BinaryView {
105    // Numeric representation. This is logically `u128`, but we split it into the high and low
106    // bits to preserve the alignment.
107    le_bytes: [u8; 16],
108
109    // Inlined representation: strings <= 12 bytes
110    inlined: Inlined,
111
112    // Reference type: strings > 12 bytes.
113    _ref: Ref,
114}
115
116assert_eq_size!(BinaryView, [u8; 16]);
117assert_eq_size!(Inlined, [u8; 16]);
118assert_eq_size!(Ref, [u8; 16]);
119assert_eq_align!(BinaryView, u128);
120
121impl BinaryView {
122    pub const MAX_INLINED_SIZE: usize = 12;
123
124    pub fn empty_view() -> Self {
125        Self {
126            inlined: Inlined::new(&[]),
127        }
128    }
129
130    pub fn new_inlined(value: &[u8]) -> Self {
131        assert!(
132            value.len() <= Self::MAX_INLINED_SIZE,
133            "expected inlined value to be <= 12 bytes, was {}",
134            value.len()
135        );
136
137        Self {
138            inlined: Inlined::new(value),
139        }
140    }
141
142    /// Create a new view over bytes stored in a block.
143    pub fn new_view(len: u32, prefix: [u8; 4], block: u32, offset: u32) -> Self {
144        Self {
145            _ref: Ref::new(len, prefix, block, offset),
146        }
147    }
148
149    #[inline]
150    pub fn len(&self) -> u32 {
151        unsafe { self.inlined.size }
152    }
153
154    #[inline]
155    pub fn is_empty(&self) -> bool {
156        self.len() > 0
157    }
158
159    #[inline]
160    #[allow(clippy::cast_possible_truncation)]
161    pub fn is_inlined(&self) -> bool {
162        self.len() <= (Self::MAX_INLINED_SIZE as u32)
163    }
164
165    pub fn as_inlined(&self) -> &Inlined {
166        unsafe { &self.inlined }
167    }
168
169    pub fn as_view(&self) -> &Ref {
170        unsafe { &self._ref }
171    }
172
173    pub fn as_u128(&self) -> u128 {
174        // SAFETY: binary view always safe to read as u128 LE bytes
175        unsafe { u128::from_le_bytes(self.le_bytes) }
176    }
177
178    /// Shifts the buffer reference by the view by a given offset, useful when merging many
179    /// varbinview arrays into one.
180    #[inline(always)]
181    pub fn offset_view(self, offset: u32) -> Self {
182        if self.is_inlined() {
183            self
184        } else {
185            // Referencing views must have their buffer_index adjusted with new offsets
186            let view_ref = self.as_view();
187            BinaryView::new_view(
188                self.len(),
189                *view_ref.prefix(),
190                offset + view_ref.buffer_index(),
191                view_ref.offset(),
192            )
193        }
194    }
195}
196
197impl From<u128> for BinaryView {
198    fn from(value: u128) -> Self {
199        BinaryView {
200            le_bytes: value.to_le_bytes(),
201        }
202    }
203}
204
205impl Debug for BinaryView {
206    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
207        let mut s = f.debug_struct("BinaryView");
208        if self.is_inlined() {
209            s.field("inline", &"i".to_string());
210        } else {
211            s.field("ref", &"r".to_string());
212        }
213        s.finish()
214    }
215}
216
217#[derive(Clone, Debug)]
218pub struct VarBinViewArray {
219    dtype: DType,
220    buffers: Vec<ByteBuffer>,
221    views: Buffer<BinaryView>,
222    validity: Validity,
223    stats_set: ArrayStats,
224}
225
226try_from_array_ref!(VarBinViewArray);
227
228pub struct VarBinViewEncoding;
229impl Encoding for VarBinViewEncoding {
230    type Array = VarBinViewArray;
231    type Metadata = EmptyMetadata;
232}
233
234impl EncodingVTable for VarBinViewEncoding {
235    fn id(&self) -> EncodingId {
236        EncodingId::new_ref("vortex.varbinview")
237    }
238}
239
240impl VarBinViewArray {
241    pub fn try_new(
242        views: Buffer<BinaryView>,
243        buffers: Vec<ByteBuffer>,
244        dtype: DType,
245        validity: Validity,
246    ) -> VortexResult<Self> {
247        if views.alignment() != Alignment::of::<BinaryView>() {
248            vortex_bail!("Views must be aligned to a 128 bits");
249        }
250
251        if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
252            vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
253        }
254
255        if dtype.is_nullable() == (validity == Validity::NonNullable) {
256            vortex_bail!("incorrect validity {:?}", validity);
257        }
258
259        Ok(Self {
260            dtype,
261            buffers,
262            views,
263            validity,
264            stats_set: Default::default(),
265        })
266    }
267
268    /// Number of raw string data buffers held by this array.
269    pub fn nbuffers(&self) -> usize {
270        self.buffers.len()
271    }
272
273    /// Access to the primitive views buffer.
274    ///
275    /// Variable-sized binary view buffer contain a "view" child array, with 16-byte entries that
276    /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of
277    /// the string (if the string has 12 bytes or fewer).
278    #[inline]
279    pub fn views(&self) -> &Buffer<BinaryView> {
280        &self.views
281    }
282
283    /// Access value bytes at a given index
284    ///
285    /// Will return a bytebuffer pointing to the underlying data without performing a copy
286    #[inline]
287    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
288        let views = self.views();
289        let view = &views[index];
290        // Expect this to be the common case: strings > 12 bytes.
291        if !view.is_inlined() {
292            let view_ref = view.as_view();
293            self.buffer(view_ref.buffer_index() as usize)
294                .slice(view_ref.to_range())
295        } else {
296            // Return access to the range of bytes around it.
297            views
298                .clone()
299                .into_byte_buffer()
300                .slice_ref(view.as_inlined().value())
301        }
302    }
303
304    /// Access one of the backing data buffers.
305    ///
306    /// # Panics
307    ///
308    /// This method panics if the provided index is out of bounds for the set of buffers provided
309    /// at construction time.
310    #[inline]
311    pub fn buffer(&self, idx: usize) -> &ByteBuffer {
312        if idx >= self.nbuffers() {
313            vortex_panic!(
314                "{idx} buffer index out of bounds, there are {} buffers",
315                self.nbuffers()
316            );
317        }
318        &self.buffers[idx]
319    }
320
321    /// Iterate over the underlying raw data buffers, not including the views buffer.
322    #[inline]
323    pub fn buffers(&self) -> &[ByteBuffer] {
324        &self.buffers
325    }
326
327    /// Validity of the array
328    pub fn validity(&self) -> &Validity {
329        &self.validity
330    }
331
332    /// Accumulate an iterable set of values into our type here.
333    #[allow(clippy::same_name_method)]
334    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
335        iter: I,
336        dtype: DType,
337    ) -> Self {
338        match dtype {
339            DType::Utf8(nullability) => {
340                let string_view_array = generic_byte_view_builder::<StringViewType, _, _>(
341                    iter.into_iter(),
342                    |builder, v| {
343                        match v {
344                            None => builder.append_null(),
345                            Some(inner) => {
346                                // SAFETY: the caller must provide valid utf8 values if Utf8 DType is passed.
347                                let utf8 = unsafe { std::str::from_utf8_unchecked(inner.as_ref()) };
348                                builder.append_value(utf8);
349                            }
350                        }
351                    },
352                );
353                VarBinViewArray::try_from_array(ArrayRef::from_arrow(
354                    &string_view_array,
355                    nullability.into(),
356                ))
357                .vortex_expect("StringViewArray to VarBinViewArray downcast")
358            }
359            DType::Binary(nullability) => {
360                let binary_view_array = generic_byte_view_builder::<BinaryViewType, _, _>(
361                    iter.into_iter(),
362                    GenericByteViewBuilder::append_option,
363                );
364                VarBinViewArray::try_from_array(ArrayRef::from_arrow(
365                    &binary_view_array,
366                    nullability.into(),
367                ))
368                .vortex_expect("BinaryViewArray to VarBinViewArray downcast")
369            }
370            other => vortex_panic!("VarBinViewArray must be Utf8 or Binary, was {other}"),
371        }
372    }
373
374    pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
375        let iter = iter.into_iter();
376        let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0);
377        for s in iter {
378            builder.append_value(s);
379        }
380        let array = ArrayRef::from_arrow(&builder.finish(), false);
381        VarBinViewArray::try_from_array(array)
382            .vortex_expect("VarBinViewArray from StringViewBuilder")
383    }
384
385    pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
386        iter: I,
387    ) -> Self {
388        let iter = iter.into_iter();
389        let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0);
390        builder.extend(iter);
391
392        let array = ArrayRef::from_arrow(&builder.finish(), true);
393        VarBinViewArray::try_from_array(array)
394            .vortex_expect("VarBinViewArray from StringViewBuilder")
395    }
396
397    pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
398        let iter = iter.into_iter();
399        let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0);
400        for b in iter {
401            builder.append_value(b);
402        }
403        let array = ArrayRef::from_arrow(&builder.finish(), false);
404        VarBinViewArray::try_from_array(array)
405            .vortex_expect("VarBinViewArray from StringViewBuilder")
406    }
407
408    pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
409        iter: I,
410    ) -> Self {
411        let iter = iter.into_iter();
412        let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0);
413        builder.extend(iter);
414        let array = ArrayRef::from_arrow(&builder.finish(), true);
415        VarBinViewArray::try_from_array(array)
416            .vortex_expect("VarBinViewArray from StringViewBuilder")
417    }
418}
419
420// Generic helper to create an Arrow ByteViewBuilder of the appropriate type.
421fn generic_byte_view_builder<B, V, F>(
422    values: impl Iterator<Item = Option<V>>,
423    mut append_fn: F,
424) -> GenericByteViewArray<B>
425where
426    B: ByteViewType,
427    V: AsRef<[u8]>,
428    F: FnMut(&mut GenericByteViewBuilder<B>, Option<V>),
429{
430    let mut builder = GenericByteViewBuilder::<B>::new();
431
432    for value in values {
433        append_fn(&mut builder, value);
434    }
435
436    builder.finish()
437}
438
439impl ArrayImpl for VarBinViewArray {
440    type Encoding = VarBinViewEncoding;
441
442    fn _len(&self) -> usize {
443        self.views.len()
444    }
445
446    fn _dtype(&self) -> &DType {
447        &self.dtype
448    }
449
450    fn _vtable(&self) -> VTableRef {
451        VTableRef::new_ref(&VarBinViewEncoding)
452    }
453}
454
455impl ArrayStatisticsImpl for VarBinViewArray {
456    fn _stats_ref(&self) -> StatsSetRef<'_> {
457        self.stats_set.to_ref(self)
458    }
459}
460
461impl ArrayCanonicalImpl for VarBinViewArray {
462    fn _to_canonical(&self) -> VortexResult<Canonical> {
463        Ok(Canonical::VarBinView(self.clone()))
464    }
465
466    fn _append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
467        builder.extend_from_array(self)
468    }
469}
470
471pub(crate) fn varbinview_as_arrow(var_bin_view: &VarBinViewArray) -> ArrowArrayRef {
472    let views = var_bin_view.views().clone();
473
474    let nulls = var_bin_view
475        .validity_mask()
476        .vortex_expect("VarBinViewArray: failed to get logical validity")
477        .to_null_buffer();
478
479    let data = (0..var_bin_view.nbuffers())
480        .map(|i| var_bin_view.buffer(i))
481        .collect::<Vec<_>>();
482
483    let data = data
484        .into_iter()
485        .map(|p| p.clone().into_arrow_buffer())
486        .collect::<Vec<_>>();
487
488    // Switch on Arrow DType.
489    match var_bin_view.dtype() {
490        DType::Binary(_) => Arc::new(unsafe {
491            BinaryViewArray::new_unchecked(
492                ScalarBuffer::<u128>::from(views.into_byte_buffer().into_arrow_buffer()),
493                data,
494                nulls,
495            )
496        }),
497        DType::Utf8(_) => Arc::new(unsafe {
498            StringViewArray::new_unchecked(
499                ScalarBuffer::<u128>::from(views.into_byte_buffer().into_arrow_buffer()),
500                data,
501                nulls,
502            )
503        }),
504        _ => vortex_panic!("expected utf8 or binary, got {}", var_bin_view.dtype()),
505    }
506}
507
508impl ArrayValidityImpl for VarBinViewArray {
509    fn _is_valid(&self, index: usize) -> VortexResult<bool> {
510        self.validity.is_valid(index)
511    }
512
513    fn _all_valid(&self) -> VortexResult<bool> {
514        self.validity.all_valid()
515    }
516
517    fn _all_invalid(&self) -> VortexResult<bool> {
518        self.validity.all_invalid()
519    }
520
521    fn _validity_mask(&self) -> VortexResult<Mask> {
522        self.validity.to_logical(self.len())
523    }
524}
525
526impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
527    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
528        Self::from_iter_nullable_bin(iter)
529    }
530}
531
532impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
533    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
534        Self::from_iter_nullable_bin(iter)
535    }
536}
537
538impl FromIterator<Option<String>> for VarBinViewArray {
539    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
540        Self::from_iter_nullable_str(iter)
541    }
542}
543
544impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
545    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
546        Self::from_iter_nullable_str(iter)
547    }
548}
549
#[cfg(test)]
mod test {
    use vortex_scalar::Scalar;

    use crate::Canonical;
    use crate::array::Array;
    use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
    use crate::compute::{scalar_at, slice};

    /// Short enough (11 bytes) to be stored inline in the view.
    const SHORT: &str = "hello world";
    /// Longer than 12 bytes, so it is stored in a data buffer.
    const LONG: &str = "hello world this is a long string";

    /// Values of both representations round-trip through `scalar_at`.
    #[test]
    pub fn varbin_view() {
        let binary_arr = VarBinViewArray::from_iter_str([SHORT, LONG]);
        assert_eq!(binary_arr.len(), 2);

        let first = scalar_at(&binary_arr, 0).unwrap();
        assert_eq!(first, Scalar::from("hello world"));

        let second = scalar_at(&binary_arr, 1).unwrap();
        assert_eq!(second, Scalar::from("hello world this is a long string"));
    }

    /// Slicing off the first element leaves the long value at index 0.
    #[test]
    pub fn slice_array() {
        let source = VarBinViewArray::from_iter_str([SHORT, LONG]);
        let binary_arr = slice(&source, 1, 2).unwrap();

        let only = scalar_at(&binary_arr, 0).unwrap();
        assert_eq!(only, Scalar::from("hello world this is a long string"));
    }

    /// Canonicalizing yields the `VarBinView` variant with the same contents.
    #[test]
    pub fn flatten_array() {
        let binary_arr = VarBinViewArray::from_iter_str(["string1", "string2"]);

        let flattened = binary_arr.to_canonical().unwrap();
        assert!(matches!(flattened, Canonical::VarBinView(_)));

        let var_bin = flattened.into_varbinview().unwrap().into_array();
        let first = scalar_at(&var_bin, 0).unwrap();
        let second = scalar_at(&var_bin, 1).unwrap();
        assert_eq!(first, Scalar::from("string1"));
        assert_eq!(second, Scalar::from("string2"));
    }

    /// A view is 16 bytes wide and 16-byte aligned.
    #[test]
    pub fn binary_view_size_and_alignment() {
        assert_eq!(std::mem::size_of::<BinaryView>(), 16);
        assert_eq!(std::mem::align_of::<BinaryView>(), 16);
    }
}
605}