vortex_array/arrays/varbinview/
mod.rs

1use std::fmt::{Debug, Formatter};
2use std::ops::Range;
3use std::sync::Arc;
4
5use arrow_array::builder::{BinaryViewBuilder, GenericByteViewBuilder, StringViewBuilder};
6use arrow_array::types::{BinaryViewType, ByteViewType, StringViewType};
7use arrow_array::{
8    ArrayRef as ArrowArrayRef, BinaryViewArray, GenericByteViewArray, StringViewArray,
9};
10use arrow_buffer::ScalarBuffer;
11use static_assertions::{assert_eq_align, assert_eq_size};
12use vortex_buffer::{Alignment, Buffer, ByteBuffer};
13use vortex_dtype::DType;
14use vortex_error::{
15    VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_err, vortex_panic,
16};
17use vortex_mask::Mask;
18
19use crate::array::{ArrayCanonicalImpl, ArrayValidityImpl};
20use crate::arrow::FromArrowArray;
21use crate::builders::ArrayBuilder;
22use crate::stats::{ArrayStats, StatsSetRef};
23use crate::validity::Validity;
24use crate::vtable::VTableRef;
25use crate::{
26    Array, ArrayImpl, ArrayRef, ArrayStatisticsImpl, Canonical, EmptyMetadata, Encoding,
27    TryFromArrayRef, try_from_array_ref,
28};
29
30mod accessor;
31mod compute;
32mod serde;
33mod variants;
34
35#[derive(Clone, Copy, Debug, PartialEq, Eq)]
36#[repr(C, align(8))]
37pub struct Inlined {
38    size: u32,
39    data: [u8; BinaryView::MAX_INLINED_SIZE],
40}
41
42impl Inlined {
43    fn new<const N: usize>(value: &[u8]) -> Self {
44        let mut inlined = Self {
45            size: N.try_into().vortex_unwrap(),
46            data: [0u8; BinaryView::MAX_INLINED_SIZE],
47        };
48        inlined.data[..N].copy_from_slice(&value[..N]);
49        inlined
50    }
51
52    #[inline]
53    pub fn value(&self) -> &[u8] {
54        &self.data[0..(self.size as usize)]
55    }
56}
57
/// View payload for values that are stored out-of-line in one of the array's data buffers.
///
/// The layout is `repr(C)`: value length, a 4-byte prefix of the value, the index of the data
/// buffer holding the value, and the byte offset of the value inside that buffer.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    size: u32,
    prefix: [u8; 4],
    buffer_index: u32,
    offset: u32,
}

impl Ref {
    /// Assembles a reference view from its four components.
    pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
        Self {
            size,
            prefix,
            buffer_index,
            offset,
        }
    }

    /// Index of the data buffer that holds the referenced value.
    #[inline]
    pub fn buffer_index(&self) -> u32 {
        self.buffer_index
    }

    /// Byte offset of the referenced value inside its data buffer.
    #[inline]
    pub fn offset(&self) -> u32 {
        self.offset
    }

    /// The 4-byte prefix stored alongside the reference.
    #[inline]
    pub fn prefix(&self) -> &[u8; 4] {
        &self.prefix
    }

    /// Byte range of the referenced value inside its data buffer.
    ///
    /// Both bounds are widened to `usize` before the addition: adding `offset + size` in `u32`
    /// would overflow (panic in debug builds, wrap to an inverted range in release builds)
    /// whenever the value ends past `u32::MAX`.
    #[inline]
    pub fn to_range(&self) -> Range<usize> {
        let start = self.offset as usize;
        start..start + self.size as usize
    }
}
97
98#[derive(Clone, Copy)]
99#[repr(C, align(16))]
100pub union BinaryView {
101    // Numeric representation. This is logically `u128`, but we split it into the high and low
102    // bits to preserve the alignment.
103    le_bytes: [u8; 16],
104
105    // Inlined representation: strings <= 12 bytes
106    inlined: Inlined,
107
108    // Reference type: strings > 12 bytes.
109    _ref: Ref,
110}
111
112assert_eq_size!(BinaryView, [u8; 16]);
113assert_eq_size!(Inlined, [u8; 16]);
114assert_eq_size!(Ref, [u8; 16]);
115assert_eq_align!(BinaryView, u128);
116
117impl BinaryView {
118    pub const MAX_INLINED_SIZE: usize = 12;
119
120    /// Create a view from a value, block and offset
121    ///
122    /// Depending on the length of the provided value either a new inlined
123    /// or a reference view will be constructed.
124    ///
125    /// Adapted from arrow-rs <https://github.com/apache/arrow-rs/blob/f4fde769ab6e1a9b75f890b7f8b47bc22800830b/arrow-array/src/builder/generic_bytes_view_builder.rs#L524>
126    /// Explicitly enumerating inlined view produces code that avoids calling generic `ptr::copy_non_interleave` that's slower than explicit stores
127    #[inline(never)]
128    pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
129        match value.len() {
130            0 => Self {
131                inlined: Inlined::new::<0>(value),
132            },
133            1 => Self {
134                inlined: Inlined::new::<1>(value),
135            },
136            2 => Self {
137                inlined: Inlined::new::<2>(value),
138            },
139            3 => Self {
140                inlined: Inlined::new::<3>(value),
141            },
142            4 => Self {
143                inlined: Inlined::new::<4>(value),
144            },
145            5 => Self {
146                inlined: Inlined::new::<5>(value),
147            },
148            6 => Self {
149                inlined: Inlined::new::<6>(value),
150            },
151            7 => Self {
152                inlined: Inlined::new::<7>(value),
153            },
154            8 => Self {
155                inlined: Inlined::new::<8>(value),
156            },
157            9 => Self {
158                inlined: Inlined::new::<9>(value),
159            },
160            10 => Self {
161                inlined: Inlined::new::<10>(value),
162            },
163            11 => Self {
164                inlined: Inlined::new::<11>(value),
165            },
166            12 => Self {
167                inlined: Inlined::new::<12>(value),
168            },
169            _ => Self {
170                _ref: Ref::new(
171                    u32::try_from(value.len()).vortex_unwrap(),
172                    value[0..4].try_into().vortex_unwrap(),
173                    block,
174                    offset,
175                ),
176            },
177        }
178    }
179
180    /// Create a new empty view
181    #[inline]
182    pub fn empty_view() -> Self {
183        Self::new_inlined(&[])
184    }
185
186    /// Create a new inlined binary view
187    #[inline]
188    pub fn new_inlined(value: &[u8]) -> Self {
189        assert!(
190            value.len() <= Self::MAX_INLINED_SIZE,
191            "expected inlined value to be <= 12 bytes, was {}",
192            value.len()
193        );
194
195        Self::make_view(value, 0, 0)
196    }
197
198    #[inline]
199    pub fn len(&self) -> u32 {
200        unsafe { self.inlined.size }
201    }
202
203    #[inline]
204    pub fn is_empty(&self) -> bool {
205        self.len() > 0
206    }
207
208    #[inline]
209    #[allow(clippy::cast_possible_truncation)]
210    pub fn is_inlined(&self) -> bool {
211        self.len() <= (Self::MAX_INLINED_SIZE as u32)
212    }
213
214    pub fn as_inlined(&self) -> &Inlined {
215        unsafe { &self.inlined }
216    }
217
218    pub fn as_view(&self) -> &Ref {
219        unsafe { &self._ref }
220    }
221
222    pub fn as_u128(&self) -> u128 {
223        // SAFETY: binary view always safe to read as u128 LE bytes
224        unsafe { u128::from_le_bytes(self.le_bytes) }
225    }
226
227    /// Shifts the buffer reference by the view by a given offset, useful when merging many
228    /// varbinview arrays into one.
229    #[inline(always)]
230    pub fn offset_view(self, offset: u32) -> Self {
231        if self.is_inlined() {
232            self
233        } else {
234            // Referencing views must have their buffer_index adjusted with new offsets
235            let view_ref = self.as_view();
236            Self {
237                _ref: Ref::new(
238                    self.len(),
239                    *view_ref.prefix(),
240                    offset + view_ref.buffer_index(),
241                    view_ref.offset(),
242                ),
243            }
244        }
245    }
246}
247
248impl From<u128> for BinaryView {
249    fn from(value: u128) -> Self {
250        BinaryView {
251            le_bytes: value.to_le_bytes(),
252        }
253    }
254}
255
256impl Debug for BinaryView {
257    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
258        let mut s = f.debug_struct("BinaryView");
259        if self.is_inlined() {
260            s.field("inline", &"i".to_string());
261        } else {
262            s.field("ref", &"r".to_string());
263        }
264        s.finish()
265    }
266}
267
268#[derive(Clone, Debug)]
269pub struct VarBinViewArray {
270    dtype: DType,
271    buffers: Vec<ByteBuffer>,
272    views: Buffer<BinaryView>,
273    validity: Validity,
274    stats_set: ArrayStats,
275}
276
277try_from_array_ref!(VarBinViewArray);
278
279#[derive(Debug)]
280pub struct VarBinViewEncoding;
281impl Encoding for VarBinViewEncoding {
282    type Array = VarBinViewArray;
283    type Metadata = EmptyMetadata;
284}
285
286impl VarBinViewArray {
287    pub fn try_new(
288        views: Buffer<BinaryView>,
289        buffers: Vec<ByteBuffer>,
290        dtype: DType,
291        validity: Validity,
292    ) -> VortexResult<Self> {
293        if views.alignment() != Alignment::of::<BinaryView>() {
294            vortex_bail!("Views must be aligned to a 128 bits");
295        }
296
297        if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
298            vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
299        }
300
301        if dtype.is_nullable() == (validity == Validity::NonNullable) {
302            vortex_bail!("incorrect validity {:?}", validity);
303        }
304
305        Ok(Self {
306            dtype,
307            buffers,
308            views,
309            validity,
310            stats_set: Default::default(),
311        })
312    }
313
314    /// Number of raw string data buffers held by this array.
315    pub fn nbuffers(&self) -> usize {
316        self.buffers.len()
317    }
318
319    /// Access to the primitive views buffer.
320    ///
321    /// Variable-sized binary view buffer contain a "view" child array, with 16-byte entries that
322    /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of
323    /// the string (if the string has 12 bytes or fewer).
324    #[inline]
325    pub fn views(&self) -> &Buffer<BinaryView> {
326        &self.views
327    }
328
329    /// Access value bytes at a given index
330    ///
331    /// Will return a bytebuffer pointing to the underlying data without performing a copy
332    #[inline]
333    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
334        let views = self.views();
335        let view = &views[index];
336        // Expect this to be the common case: strings > 12 bytes.
337        if !view.is_inlined() {
338            let view_ref = view.as_view();
339            self.buffer(view_ref.buffer_index() as usize)
340                .slice(view_ref.to_range())
341        } else {
342            // Return access to the range of bytes around it.
343            views
344                .clone()
345                .into_byte_buffer()
346                .slice_ref(view.as_inlined().value())
347        }
348    }
349
350    /// Access one of the backing data buffers.
351    ///
352    /// # Panics
353    ///
354    /// This method panics if the provided index is out of bounds for the set of buffers provided
355    /// at construction time.
356    #[inline]
357    pub fn buffer(&self, idx: usize) -> &ByteBuffer {
358        if idx >= self.nbuffers() {
359            vortex_panic!(
360                "{idx} buffer index out of bounds, there are {} buffers",
361                self.nbuffers()
362            );
363        }
364        &self.buffers[idx]
365    }
366
367    /// Iterate over the underlying raw data buffers, not including the views buffer.
368    #[inline]
369    pub fn buffers(&self) -> &[ByteBuffer] {
370        &self.buffers
371    }
372
373    /// Validity of the array
374    pub fn validity(&self) -> &Validity {
375        &self.validity
376    }
377
378    /// Accumulate an iterable set of values into our type here.
379    #[allow(clippy::same_name_method)]
380    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
381        iter: I,
382        dtype: DType,
383    ) -> Self {
384        match dtype {
385            DType::Utf8(nullability) => {
386                let string_view_array = generic_byte_view_builder::<StringViewType, _, _>(
387                    iter.into_iter(),
388                    |builder, v| {
389                        match v {
390                            None => builder.append_null(),
391                            Some(inner) => {
392                                // SAFETY: the caller must provide valid utf8 values if Utf8 DType is passed.
393                                let utf8 = unsafe { std::str::from_utf8_unchecked(inner.as_ref()) };
394                                builder.append_value(utf8);
395                            }
396                        }
397                    },
398                );
399                VarBinViewArray::try_from_array(ArrayRef::from_arrow(
400                    &string_view_array,
401                    nullability.into(),
402                ))
403                .map_err(|_| vortex_err!("Array was not a VarBinViewArray"))
404                .vortex_expect("StringViewArray to VarBinViewArray downcast")
405            }
406            DType::Binary(nullability) => {
407                let binary_view_array = generic_byte_view_builder::<BinaryViewType, _, _>(
408                    iter.into_iter(),
409                    GenericByteViewBuilder::append_option,
410                );
411                VarBinViewArray::try_from_array(ArrayRef::from_arrow(
412                    &binary_view_array,
413                    nullability.into(),
414                ))
415                .map_err(|_| vortex_err!("Array was not a VarBinViewArray"))
416                .vortex_expect("BinaryViewArray to VarBinViewArray downcast")
417            }
418            other => vortex_panic!("VarBinViewArray must be Utf8 or Binary, was {other}"),
419        }
420    }
421
422    pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
423        let iter = iter.into_iter();
424        let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0);
425        for s in iter {
426            builder.append_value(s);
427        }
428        let array = ArrayRef::from_arrow(&builder.finish(), false);
429        VarBinViewArray::try_from_array(array)
430            .map_err(|_| vortex_err!("Array was not a VarBinViewArray"))
431            .vortex_expect("VarBinViewArray from StringViewBuilder")
432    }
433
434    pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
435        iter: I,
436    ) -> Self {
437        let iter = iter.into_iter();
438        let mut builder = StringViewBuilder::with_capacity(iter.size_hint().0);
439        builder.extend(iter);
440
441        let array = ArrayRef::from_arrow(&builder.finish(), true);
442        VarBinViewArray::try_from_array(array)
443            .map_err(|_| vortex_err!("Array was not a VarBinViewArray"))
444            .vortex_expect("VarBinViewArray from StringViewBuilder")
445    }
446
447    pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
448        let iter = iter.into_iter();
449        let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0);
450        for b in iter {
451            builder.append_value(b);
452        }
453        let array = ArrayRef::from_arrow(&builder.finish(), false);
454        VarBinViewArray::try_from_array(array)
455            .map_err(|_| vortex_err!("Array was not a VarBinViewArray"))
456            .vortex_expect("VarBinViewArray from StringViewBuilder")
457    }
458
459    pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
460        iter: I,
461    ) -> Self {
462        let iter = iter.into_iter();
463        let mut builder = BinaryViewBuilder::with_capacity(iter.size_hint().0);
464        builder.extend(iter);
465        let array = ArrayRef::from_arrow(&builder.finish(), true);
466        VarBinViewArray::try_from_array(array)
467            .map_err(|_| vortex_err!("Array was not a VarBinViewArray"))
468            .vortex_expect("VarBinViewArray from StringViewBuilder")
469    }
470}
471
472// Generic helper to create an Arrow ByteViewBuilder of the appropriate type.
473fn generic_byte_view_builder<B, V, F>(
474    values: impl Iterator<Item = Option<V>>,
475    mut append_fn: F,
476) -> GenericByteViewArray<B>
477where
478    B: ByteViewType,
479    V: AsRef<[u8]>,
480    F: FnMut(&mut GenericByteViewBuilder<B>, Option<V>),
481{
482    let mut builder = GenericByteViewBuilder::<B>::new();
483
484    for value in values {
485        append_fn(&mut builder, value);
486    }
487
488    builder.finish()
489}
490
491impl ArrayImpl for VarBinViewArray {
492    type Encoding = VarBinViewEncoding;
493
494    fn _len(&self) -> usize {
495        self.views.len()
496    }
497
498    fn _dtype(&self) -> &DType {
499        &self.dtype
500    }
501
502    fn _vtable(&self) -> VTableRef {
503        VTableRef::new_ref(&VarBinViewEncoding)
504    }
505
506    fn _with_children(&self, children: &[ArrayRef]) -> VortexResult<Self> {
507        let mut this = self.clone();
508
509        if let Validity::Array(array) = &mut this.validity {
510            *array = children[0].clone();
511        }
512
513        Ok(this)
514    }
515}
516
517impl ArrayStatisticsImpl for VarBinViewArray {
518    fn _stats_ref(&self) -> StatsSetRef<'_> {
519        self.stats_set.to_ref(self)
520    }
521}
522
523impl ArrayCanonicalImpl for VarBinViewArray {
524    fn _to_canonical(&self) -> VortexResult<Canonical> {
525        Ok(Canonical::VarBinView(self.clone()))
526    }
527
528    fn _append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
529        builder.extend_from_array(self)
530    }
531}
532
533pub(crate) fn varbinview_as_arrow(var_bin_view: &VarBinViewArray) -> ArrowArrayRef {
534    let views = var_bin_view.views().clone();
535
536    let nulls = var_bin_view
537        .validity_mask()
538        .vortex_expect("VarBinViewArray: failed to get logical validity")
539        .to_null_buffer();
540
541    let data = (0..var_bin_view.nbuffers())
542        .map(|i| var_bin_view.buffer(i))
543        .collect::<Vec<_>>();
544
545    let data = data
546        .into_iter()
547        .map(|p| p.clone().into_arrow_buffer())
548        .collect::<Vec<_>>();
549
550    // Switch on Arrow DType.
551    match var_bin_view.dtype() {
552        DType::Binary(_) => Arc::new(unsafe {
553            BinaryViewArray::new_unchecked(
554                ScalarBuffer::<u128>::from(views.into_byte_buffer().into_arrow_buffer()),
555                data,
556                nulls,
557            )
558        }),
559        DType::Utf8(_) => Arc::new(unsafe {
560            StringViewArray::new_unchecked(
561                ScalarBuffer::<u128>::from(views.into_byte_buffer().into_arrow_buffer()),
562                data,
563                nulls,
564            )
565        }),
566        _ => vortex_panic!("expected utf8 or binary, got {}", var_bin_view.dtype()),
567    }
568}
569
570impl ArrayValidityImpl for VarBinViewArray {
571    fn _is_valid(&self, index: usize) -> VortexResult<bool> {
572        self.validity.is_valid(index)
573    }
574
575    fn _all_valid(&self) -> VortexResult<bool> {
576        self.validity.all_valid()
577    }
578
579    fn _all_invalid(&self) -> VortexResult<bool> {
580        self.validity.all_invalid()
581    }
582
583    fn _validity_mask(&self) -> VortexResult<Mask> {
584        self.validity.to_mask(self.len())
585    }
586}
587
588impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
589    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
590        Self::from_iter_nullable_bin(iter)
591    }
592}
593
594impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
595    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
596        Self::from_iter_nullable_bin(iter)
597    }
598}
599
600impl FromIterator<Option<String>> for VarBinViewArray {
601    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
602        Self::from_iter_nullable_str(iter)
603    }
604}
605
606impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
607    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
608        Self::from_iter_nullable_str(iter)
609    }
610}
611
#[cfg(test)]
mod test {
    use vortex_scalar::Scalar;

    use crate::Canonical;
    use crate::array::Array;
    use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
    use crate::compute::{scalar_at, slice};

    /// Short enough (11 bytes) to be stored inline in the view.
    const SHORT: &str = "hello world";
    /// Longer than 12 bytes, so it is stored in a data buffer.
    const LONG: &str = "hello world this is a long string";

    /// Both an inlined and a referenced value can be read back by index.
    #[test]
    pub fn varbin_view() {
        let array = VarBinViewArray::from_iter_str([SHORT, LONG]);
        assert_eq!(array.len(), 2);

        let first = scalar_at(&array, 0).unwrap();
        assert_eq!(first, Scalar::from(SHORT));

        let second = scalar_at(&array, 1).unwrap();
        assert_eq!(second, Scalar::from(LONG));
    }

    /// Slicing off the first element leaves the long value at index 0.
    #[test]
    pub fn slice_array() {
        let source = VarBinViewArray::from_iter_str([SHORT, LONG]);
        let sliced = slice(&source, 1, 2).unwrap();

        let only = scalar_at(&sliced, 0).unwrap();
        assert_eq!(only, Scalar::from(LONG));
    }

    /// Canonicalizing yields a `VarBinView` with the same contents.
    #[test]
    pub fn flatten_array() {
        let array = VarBinViewArray::from_iter_str(["string1", "string2"]);

        let canonical = array.to_canonical().unwrap();
        assert!(matches!(canonical, Canonical::VarBinView(_)));

        let round_tripped = canonical.into_varbinview().unwrap().into_array();
        for (index, expected) in ["string1", "string2"].into_iter().enumerate() {
            assert_eq!(
                scalar_at(&round_tripped, index).unwrap(),
                Scalar::from(expected)
            );
        }
    }

    /// A view is exactly 16 bytes wide and 16-byte aligned.
    #[test]
    pub fn binary_view_size_and_alignment() {
        assert_eq!(std::mem::size_of::<BinaryView>(), 16);
        assert_eq!(std::mem::align_of::<BinaryView>(), 16);
    }
}