vortex_array/arrays/varbinview/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::{Debug, Formatter};
5use std::ops::Range;
6use std::sync::Arc;
7
8use static_assertions::{assert_eq_align, assert_eq_size};
9use vortex_buffer::{Buffer, ByteBuffer};
10use vortex_dtype::{DType, Nullability};
11use vortex_error::{
12    VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_ensure, vortex_err, vortex_panic,
13};
14
15use crate::builders::{ArrayBuilder, VarBinViewBuilder};
16use crate::stats::{ArrayStats, StatsSetRef};
17use crate::validity::Validity;
18use crate::vtable::{
19    ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper,
20    ValidityVTableFromValidityHelper,
21};
22use crate::{Canonical, EncodingId, EncodingRef, vtable};
23
24mod accessor;
25mod compact;
26mod compute;
27mod ops;
28mod serde;
29
/// Inlined representation of a [`BinaryView`]: the value's length followed by up to
/// [`BinaryView::MAX_INLINED_SIZE`] bytes holding the value itself.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(C, align(8))]
pub struct Inlined {
    /// Number of meaningful bytes at the start of `data`.
    size: u32,
    /// The value's bytes; `Inlined::new` leaves the unused tail zeroed.
    data: [u8; BinaryView::MAX_INLINED_SIZE],
}
36
37impl Inlined {
38    fn new<const N: usize>(value: &[u8]) -> Self {
39        let mut inlined = Self {
40            size: N.try_into().vortex_unwrap(),
41            data: [0u8; BinaryView::MAX_INLINED_SIZE],
42        };
43        inlined.data[..N].copy_from_slice(&value[..N]);
44        inlined
45    }
46
47    #[inline]
48    pub fn value(&self) -> &[u8] {
49        &self.data[0..(self.size as usize)]
50    }
51}
52
/// Out-of-line ("reference") representation of a [`BinaryView`].
///
/// Instead of holding the value itself, it records the value's length, its first four bytes,
/// and where the full value lives: the index of a data buffer and a byte offset into it.
#[derive(Clone, Copy, Debug)]
#[repr(C, align(8))]
pub struct Ref {
    /// Length of the referenced value in bytes.
    size: u32,
    /// Copy of the first four bytes of the referenced value.
    prefix: [u8; 4],
    /// Index of the data buffer that holds the value.
    buffer_index: u32,
    /// Byte offset of the value inside that data buffer.
    offset: u32,
}

impl Ref {
    /// Creates a reference view from its four components.
    ///
    /// No validation is performed here; the caller is responsible for passing a prefix,
    /// buffer index and offset that actually describe the value.
    pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
        Self {
            size,
            prefix,
            buffer_index,
            offset,
        }
    }

    /// Index of the data buffer that holds the referenced value.
    #[inline]
    pub fn buffer_index(&self) -> u32 {
        self.buffer_index
    }

    /// Byte offset of the referenced value inside its data buffer.
    #[inline]
    pub fn offset(&self) -> u32 {
        self.offset
    }

    /// The stored copy of the first four bytes of the referenced value.
    #[inline]
    pub fn prefix(&self) -> &[u8; 4] {
        &self.prefix
    }

    /// Byte range `offset..offset + size` of the referenced value inside its data buffer.
    ///
    /// Both fields are widened to `usize` *before* they are added. Adding them as `u32` would
    /// overflow (panic in debug builds, wrap in release builds) for a value that ends past the
    /// 4 GiB mark of its buffer, even though each field on its own is a valid `u32`.
    #[inline]
    pub fn to_range(&self) -> Range<usize> {
        let start = self.offset as usize;
        // Saturate rather than wrap on targets where `usize` is only 32 bits wide; a saturated
        // end simply fails the bounds check of whatever slice operation consumes the range.
        let end = start.saturating_add(self.size as usize);
        start..end
    }
}
92
/// A 16-byte view of one binary/string value.
///
/// The same 16 bytes can be read in three ways. Which of `inlined` or `_ref` is meaningful is
/// decided by the length stored in the first four bytes (see `is_inlined`).
#[derive(Clone, Copy)]
#[repr(C, align(16))]
pub union BinaryView {
    // Numeric representation. This is logically a `u128`, stored as its 16 little-endian bytes
    // (see `as_u128` / `From<u128>`); the 16-byte alignment comes from the `repr` attribute above.
    le_bytes: [u8; 16],

    // Inlined representation: strings <= 12 bytes
    inlined: Inlined,

    // Reference type: strings > 12 bytes.
    _ref: Ref,
}
106
// Compile-time layout checks: every representation of a view occupies exactly 16 bytes, and
// `BinaryView` has the same alignment as `u128`.
assert_eq_size!(BinaryView, [u8; 16]);
assert_eq_size!(Inlined, [u8; 16]);
assert_eq_size!(Ref, [u8; 16]);
assert_eq_align!(BinaryView, u128);
111
112impl BinaryView {
113    pub const MAX_INLINED_SIZE: usize = 12;
114
115    /// Create a view from a value, block and offset
116    ///
117    /// Depending on the length of the provided value either a new inlined
118    /// or a reference view will be constructed.
119    ///
120    /// Adapted from arrow-rs <https://github.com/apache/arrow-rs/blob/f4fde769ab6e1a9b75f890b7f8b47bc22800830b/arrow-array/src/builder/generic_bytes_view_builder.rs#L524>
121    /// Explicitly enumerating inlined view produces code that avoids calling generic `ptr::copy_non_interleave` that's slower than explicit stores
122    #[inline(never)]
123    pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
124        match value.len() {
125            0 => Self {
126                inlined: Inlined::new::<0>(value),
127            },
128            1 => Self {
129                inlined: Inlined::new::<1>(value),
130            },
131            2 => Self {
132                inlined: Inlined::new::<2>(value),
133            },
134            3 => Self {
135                inlined: Inlined::new::<3>(value),
136            },
137            4 => Self {
138                inlined: Inlined::new::<4>(value),
139            },
140            5 => Self {
141                inlined: Inlined::new::<5>(value),
142            },
143            6 => Self {
144                inlined: Inlined::new::<6>(value),
145            },
146            7 => Self {
147                inlined: Inlined::new::<7>(value),
148            },
149            8 => Self {
150                inlined: Inlined::new::<8>(value),
151            },
152            9 => Self {
153                inlined: Inlined::new::<9>(value),
154            },
155            10 => Self {
156                inlined: Inlined::new::<10>(value),
157            },
158            11 => Self {
159                inlined: Inlined::new::<11>(value),
160            },
161            12 => Self {
162                inlined: Inlined::new::<12>(value),
163            },
164            _ => Self {
165                _ref: Ref::new(
166                    u32::try_from(value.len()).vortex_unwrap(),
167                    value[0..4].try_into().vortex_unwrap(),
168                    block,
169                    offset,
170                ),
171            },
172        }
173    }
174
175    /// Create a new empty view
176    #[inline]
177    pub fn empty_view() -> Self {
178        Self::new_inlined(&[])
179    }
180
181    /// Create a new inlined binary view
182    #[inline]
183    pub fn new_inlined(value: &[u8]) -> Self {
184        assert!(
185            value.len() <= Self::MAX_INLINED_SIZE,
186            "expected inlined value to be <= 12 bytes, was {}",
187            value.len()
188        );
189
190        Self::make_view(value, 0, 0)
191    }
192
193    #[inline]
194    pub fn len(&self) -> u32 {
195        unsafe { self.inlined.size }
196    }
197
198    #[inline]
199    pub fn is_empty(&self) -> bool {
200        self.len() > 0
201    }
202
203    #[inline]
204    #[allow(clippy::cast_possible_truncation)]
205    pub fn is_inlined(&self) -> bool {
206        self.len() <= (Self::MAX_INLINED_SIZE as u32)
207    }
208
209    pub fn as_inlined(&self) -> &Inlined {
210        unsafe { &self.inlined }
211    }
212
213    pub fn as_view(&self) -> &Ref {
214        unsafe { &self._ref }
215    }
216
217    pub fn as_u128(&self) -> u128 {
218        // SAFETY: binary view always safe to read as u128 LE bytes
219        unsafe { u128::from_le_bytes(self.le_bytes) }
220    }
221
222    /// Override the buffer reference with the given buffer_idx, only if this view is not inlined.
223    #[inline(always)]
224    pub fn with_buffer_idx(self, buffer_idx: u32) -> Self {
225        if self.is_inlined() {
226            self
227        } else {
228            // Referencing views must have their buffer_index adjusted with new offsets
229            let view_ref = self.as_view();
230            Self {
231                _ref: Ref::new(
232                    self.len(),
233                    *view_ref.prefix(),
234                    buffer_idx,
235                    view_ref.offset(),
236                ),
237            }
238        }
239    }
240
241    /// Shifts the buffer reference by the view by a given offset, useful when merging many
242    /// varbinview arrays into one.
243    #[inline(always)]
244    pub fn offset_view(self, offset: u32) -> Self {
245        if self.is_inlined() {
246            self
247        } else {
248            // Referencing views must have their buffer_index adjusted with new offsets
249            let view_ref = self.as_view();
250            Self {
251                _ref: Ref::new(
252                    self.len(),
253                    *view_ref.prefix(),
254                    offset + view_ref.buffer_index(),
255                    view_ref.offset(),
256                ),
257            }
258        }
259    }
260}
261
262impl From<u128> for BinaryView {
263    fn from(value: u128) -> Self {
264        BinaryView {
265            le_bytes: value.to_le_bytes(),
266        }
267    }
268}
269
270impl Debug for BinaryView {
271    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
272        let mut s = f.debug_struct("BinaryView");
273        if self.is_inlined() {
274            s.field("inline", &"i".to_string());
275        } else {
276            s.field("ref", &"r".to_string());
277        }
278        s.finish()
279    }
280}
281
// Invokes the crate's `vtable!` macro for this encoding.
// NOTE(review): presumably this declares `VarBinViewVTable` and its glue, since that type is
// used below but not defined in this file — confirm against `crate::vtable`.
vtable!(VarBinView);
283
284impl VTable for VarBinViewVTable {
285    type Array = VarBinViewArray;
286    type Encoding = VarBinViewEncoding;
287
288    type ArrayVTable = Self;
289    type CanonicalVTable = Self;
290    type OperationsVTable = Self;
291    type ValidityVTable = ValidityVTableFromValidityHelper;
292    type VisitorVTable = Self;
293    type ComputeVTable = NotSupported;
294    type EncodeVTable = NotSupported;
295    type SerdeVTable = Self;
296
297    fn id(_encoding: &Self::Encoding) -> EncodingId {
298        EncodingId::new_ref("vortex.varbinview")
299    }
300
301    fn encoding(_array: &Self::Array) -> EncodingRef {
302        EncodingRef::new_ref(VarBinViewEncoding.as_ref())
303    }
304}
305
306/// A variable-length binary view array that stores strings and binary data efficiently.
307///
308/// This mirrors the Apache Arrow StringView/BinaryView array encoding and provides
309/// an optimized representation for variable-length data with excellent performance
310/// characteristics for both short and long strings.
311///
312/// ## Data Layout
313///
314/// The array uses a hybrid storage approach with two main components:
315/// - **Views buffer**: Array of 16-byte `BinaryView` entries (one per logical element)
316/// - **Data buffers**: Shared backing storage for strings longer than 12 bytes
317///
318/// ## View Structure
319///
320/// Commonly referred to as "German Strings", each 16-byte view entry contains either:
321/// - **Inlined data**: For strings ≤ 12 bytes, the entire string is stored directly in the view
322/// - **Reference data**: For strings > 12 bytes, contains:
323///   - String length (4 bytes)
324///   - First 4 bytes of string as prefix (4 bytes)
325///   - Buffer index and offset (8 bytes total)
326///
327/// The following ASCII graphic is reproduced verbatim from the Arrow documentation:
328///
329/// ```text
330///                         ┌──────┬────────────────────────┐
331///                         │length│      string value      │
332///    Strings (len <= 12)  │      │    (padded with 0)     │
333///                         └──────┴────────────────────────┘
334///                          0    31                      127
335///
336///                         ┌───────┬───────┬───────┬───────┐
337///                         │length │prefix │  buf  │offset │
338///    Strings (len > 12)   │       │       │ index │       │
339///                         └───────┴───────┴───────┴───────┘
340///                          0    31       63      95    127
341/// ```
342///
343/// # Examples
344///
345/// ```
346/// use vortex_array::arrays::VarBinViewArray;
347/// use vortex_dtype::{DType, Nullability};
348/// use vortex_array::IntoArray;
349///
350/// // Create from an Iterator<Item = &str>
351/// let array = VarBinViewArray::from_iter_str([
352///         "inlined",
353///         "this string is outlined"
354/// ]);
355///
356/// assert_eq!(array.len(), 2);
357///
358/// // Access individual strings
359/// let first = array.bytes_at(0);
360/// assert_eq!(first.as_slice(), b"inlined"); // "short"
361///
362/// let second = array.bytes_at(1);
363/// assert_eq!(second.as_slice(), b"this string is outlined"); // Long string
364/// ```
#[derive(Clone, Debug)]
pub struct VarBinViewArray {
    /// Logical type of the array; validation only accepts `Utf8` or `Binary`.
    dtype: DType,
    /// Data buffers that non-inlined views point into via their buffer index.
    buffers: Arc<[ByteBuffer]>,
    /// One 16-byte view per element; the array length is the number of views.
    views: Buffer<BinaryView>,
    /// Validity (nullness) of the elements; its nullability must match `dtype`'s.
    validity: Validity,
    /// Statistics for this array; starts out as `Default::default()` in the constructors.
    stats_set: ArrayStats,
}
373
/// Stateless marker type used as the `Encoding` associated type of `VarBinViewVTable`.
#[derive(Clone, Debug)]
pub struct VarBinViewEncoding;
376
377impl VarBinViewArray {
378    fn validate(
379        views: &Buffer<BinaryView>,
380        buffers: &Arc<[ByteBuffer]>,
381        dtype: &DType,
382        validity: &Validity,
383    ) -> VortexResult<()> {
384        vortex_ensure!(
385            validity.nullability() == dtype.nullability(),
386            "validity {:?} incompatible with nullability {:?}",
387            validity,
388            dtype.nullability()
389        );
390
391        match dtype {
392            DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
393                std::str::from_utf8(string).is_ok()
394            })?,
395            DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
396            _ => vortex_bail!("invalid DType {dtype}"),
397        }
398
399        Ok(())
400    }
401
402    fn validate_views<F>(
403        views: &Buffer<BinaryView>,
404        buffers: &Arc<[ByteBuffer]>,
405        validity: &Validity,
406        validator: F,
407    ) -> VortexResult<()>
408    where
409        F: Fn(&[u8]) -> bool,
410    {
411        for (idx, &view) in views.iter().enumerate() {
412            if validity.is_null(idx)? {
413                continue;
414            }
415
416            if view.is_inlined() {
417                // Validate the inline bytestring
418                let bytes = &unsafe { view.inlined }.data[..view.len() as usize];
419                vortex_ensure!(
420                    validator(bytes),
421                    "view at index {idx}: inlined bytes failed utf-8 validation"
422                );
423            } else {
424                // Validate the view pointer
425                let view = view.as_view();
426                let buf_index = view.buffer_index as usize;
427                let start_offset = view.offset as usize;
428                let end_offset = start_offset.saturating_add(view.size as usize);
429
430                let buf = buffers.get(buf_index).ok_or_else(||
431                    vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers",
432                        buffers.len()))?;
433
434                vortex_ensure!(
435                    start_offset < buf.len(),
436                    "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
437                    buf.len(),
438                );
439
440                vortex_ensure!(
441                    end_offset <= buf.len(),
442                    "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
443                    buf.len(),
444                );
445
446                // Make sure the prefix data matches the buffer data.
447                let bytes = &buf[start_offset..end_offset];
448                vortex_ensure!(
449                    view.prefix == bytes[..4],
450                    "VarBinView prefix does not match full string"
451                );
452
453                // Validate the full string
454                vortex_ensure!(
455                    validator(bytes),
456                    "view at index {idx}: outlined bytes fails utf-8 validation"
457                );
458            }
459        }
460
461        Ok(())
462    }
463}
464
465impl VarBinViewArray {
466    /// Build a new `VarBinViewArray` from components with validation.
467    ///
468    /// # Safety
469    /// This should only be used when you know for certain that all components are already
470    /// validated, for example during array operations that preserve the invariants of the encoding.
471    ///
472    /// See [`VarBinViewArray::try_new`] for a safe constructor that does validation.
473    pub unsafe fn new_unchecked(
474        views: Buffer<BinaryView>,
475        buffers: Arc<[ByteBuffer]>,
476        dtype: DType,
477        validity: Validity,
478    ) -> Self {
479        Self {
480            dtype,
481            buffers,
482            views,
483            validity,
484            stats_set: Default::default(),
485        }
486    }
487
488    pub fn new(
489        views: Buffer<BinaryView>,
490        buffers: Arc<[ByteBuffer]>,
491        dtype: DType,
492        validity: Validity,
493    ) -> Self {
494        Self::try_new(views, buffers, dtype, validity).vortex_expect("VarBinViewArray new")
495    }
496
497    pub fn try_new(
498        views: Buffer<BinaryView>,
499        buffers: Arc<[ByteBuffer]>,
500        dtype: DType,
501        validity: Validity,
502    ) -> VortexResult<Self> {
503        Self::validate(&views, &buffers, &dtype, &validity)?;
504
505        Ok(Self {
506            dtype,
507            buffers,
508            views,
509            validity,
510            stats_set: Default::default(),
511        })
512    }
513
514    /// Number of raw string data buffers held by this array.
515    pub fn nbuffers(&self) -> usize {
516        self.buffers.len()
517    }
518
519    /// Access to the primitive views buffer.
520    ///
521    /// Variable-sized binary view buffer contain a "view" child array, with 16-byte entries that
522    /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of
523    /// the string (if the string has 12 bytes or fewer).
524    #[inline]
525    pub fn views(&self) -> &Buffer<BinaryView> {
526        &self.views
527    }
528
529    /// Access value bytes at a given index
530    ///
531    /// Will return a `ByteBuffer` containing the data without performing a copy.
532    #[inline]
533    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
534        let views = self.views();
535        let view = &views[index];
536        // Expect this to be the common case: strings > 12 bytes.
537        if !view.is_inlined() {
538            let view_ref = view.as_view();
539            self.buffer(view_ref.buffer_index() as usize)
540                .slice(view_ref.to_range())
541        } else {
542            // Return access to the range of bytes around it.
543            views
544                .clone()
545                .into_byte_buffer()
546                .slice_ref(view.as_inlined().value())
547        }
548    }
549
550    /// Access one of the backing data buffers.
551    ///
552    /// # Panics
553    ///
554    /// This method panics if the provided index is out of bounds for the set of buffers provided
555    /// at construction time.
556    #[inline]
557    pub fn buffer(&self, idx: usize) -> &ByteBuffer {
558        if idx >= self.nbuffers() {
559            vortex_panic!(
560                "{idx} buffer index out of bounds, there are {} buffers",
561                self.nbuffers()
562            );
563        }
564        &self.buffers[idx]
565    }
566
567    /// Iterate over the underlying raw data buffers, not including the views buffer.
568    #[inline]
569    pub fn buffers(&self) -> &Arc<[ByteBuffer]> {
570        &self.buffers
571    }
572
573    /// Accumulate an iterable set of values into our type here.
574    #[allow(clippy::same_name_method)]
575    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
576        iter: I,
577        dtype: DType,
578    ) -> Self {
579        let iter = iter.into_iter();
580        let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
581
582        for item in iter {
583            match item {
584                None => builder.append_null(),
585                Some(v) => builder.append_value(v),
586            }
587        }
588
589        builder.finish_into_varbinview()
590    }
591
592    pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
593        let iter = iter.into_iter();
594        let mut builder = VarBinViewBuilder::with_capacity(
595            DType::Utf8(Nullability::NonNullable),
596            iter.size_hint().0,
597        );
598
599        for item in iter {
600            builder.append_value(item.as_ref());
601        }
602
603        builder.finish_into_varbinview()
604    }
605
606    pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
607        iter: I,
608    ) -> Self {
609        let iter = iter.into_iter();
610        let mut builder = VarBinViewBuilder::with_capacity(
611            DType::Utf8(Nullability::Nullable),
612            iter.size_hint().0,
613        );
614
615        for item in iter {
616            match item {
617                None => builder.append_null(),
618                Some(v) => builder.append_value(v.as_ref()),
619            }
620        }
621
622        builder.finish_into_varbinview()
623    }
624
625    pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
626        let iter = iter.into_iter();
627        let mut builder = VarBinViewBuilder::with_capacity(
628            DType::Binary(Nullability::NonNullable),
629            iter.size_hint().0,
630        );
631
632        for item in iter {
633            builder.append_value(item.as_ref());
634        }
635
636        builder.finish_into_varbinview()
637    }
638
639    pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
640        iter: I,
641    ) -> Self {
642        let iter = iter.into_iter();
643        let mut builder = VarBinViewBuilder::with_capacity(
644            DType::Binary(Nullability::Nullable),
645            iter.size_hint().0,
646        );
647
648        for item in iter {
649            match item {
650                None => builder.append_null(),
651                Some(v) => builder.append_value(v.as_ref()),
652            }
653        }
654
655        builder.finish_into_varbinview()
656    }
657}
658
659impl ArrayVTable<VarBinViewVTable> for VarBinViewVTable {
660    fn len(array: &VarBinViewArray) -> usize {
661        array.views.len()
662    }
663
664    fn dtype(array: &VarBinViewArray) -> &DType {
665        &array.dtype
666    }
667
668    fn stats(array: &VarBinViewArray) -> StatsSetRef<'_> {
669        array.stats_set.to_ref(array.as_ref())
670    }
671}
672
/// Exposes the validity stored on the array through the `ValidityHelper` trait.
impl ValidityHelper for VarBinViewArray {
    /// Returns the validity the array was constructed with.
    fn validity(&self) -> &Validity {
        &self.validity
    }
}
678
679impl CanonicalVTable<VarBinViewVTable> for VarBinViewVTable {
680    fn canonicalize(array: &VarBinViewArray) -> VortexResult<Canonical> {
681        Ok(Canonical::VarBinView(array.clone()))
682    }
683
684    fn append_to_builder(
685        array: &VarBinViewArray,
686        builder: &mut dyn ArrayBuilder,
687    ) -> VortexResult<()> {
688        builder.extend_from_array(array.as_ref())
689    }
690}
691
692impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
693    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
694        Self::from_iter_nullable_bin(iter)
695    }
696}
697
698impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
699    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
700        Self::from_iter_nullable_bin(iter)
701    }
702}
703
704impl FromIterator<Option<String>> for VarBinViewArray {
705    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
706        Self::from_iter_nullable_str(iter)
707    }
708}
709
710impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
711    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
712        Self::from_iter_nullable_str(iter)
713    }
714}
715
#[cfg(test)]
mod test {
    use vortex_scalar::Scalar;

    use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
    use crate::{Array, Canonical, IntoArray};

    /// 11 bytes: short enough to be stored inline in the view.
    const SHORT: &str = "hello world";
    /// 33 bytes: long enough to be stored in a data buffer.
    const LONG: &str = "hello world this is a long string";

    /// Both inlined and out-of-line values can be read back as scalars.
    #[test]
    pub fn varbin_view() {
        let binary_arr = VarBinViewArray::from_iter_str([SHORT, LONG]);

        assert_eq!(binary_arr.len(), 2);
        assert_eq!(binary_arr.scalar_at(0), Scalar::from(SHORT));
        assert_eq!(binary_arr.scalar_at(1), Scalar::from(LONG));
    }

    /// Slicing off the first element leaves the long string at index 0.
    #[test]
    pub fn slice_array() {
        let sliced = VarBinViewArray::from_iter_str([SHORT, LONG]).slice(1, 2);

        assert_eq!(sliced.scalar_at(0), Scalar::from(LONG));
    }

    /// Canonicalizing yields the `VarBinView` variant with the same values.
    #[test]
    pub fn flatten_array() {
        let binary_arr = VarBinViewArray::from_iter_str(["string1", "string2"]);

        let flattened = binary_arr.to_canonical().unwrap();
        assert!(matches!(flattened, Canonical::VarBinView(_)));

        let var_bin = flattened.into_varbinview().unwrap().into_array();
        for (index, expected) in ["string1", "string2"].into_iter().enumerate() {
            assert_eq!(var_bin.scalar_at(index), Scalar::from(expected));
        }
    }

    /// A view is exactly 16 bytes and 16-byte aligned.
    #[test]
    pub fn binary_view_size_and_alignment() {
        assert_eq!(std::mem::size_of::<BinaryView>(), 16);
        assert_eq!(std::mem::align_of::<BinaryView>(), 16);
    }
}