vortex_array/arrays/varbinview/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::{Debug, Formatter};
5use std::ops::Range;
6use std::sync::Arc;
7
8use static_assertions::{assert_eq_align, assert_eq_size};
9use vortex_buffer::{Alignment, Buffer, ByteBuffer};
10use vortex_dtype::{DType, Nullability};
11use vortex_error::{VortexResult, VortexUnwrap, vortex_bail, vortex_panic};
12
13use crate::builders::{ArrayBuilder, VarBinViewBuilder};
14use crate::stats::{ArrayStats, StatsSetRef};
15use crate::validity::Validity;
16use crate::vtable::{
17    ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper,
18    ValidityVTableFromValidityHelper,
19};
20use crate::{Canonical, EncodingId, EncodingRef, vtable};
21
22mod accessor;
23mod compact;
24mod compute;
25mod ops;
26mod serde;
27
28#[derive(Clone, Copy, Debug, PartialEq, Eq)]
29#[repr(C, align(8))]
30pub struct Inlined {
31    size: u32,
32    data: [u8; BinaryView::MAX_INLINED_SIZE],
33}
34
35impl Inlined {
36    fn new<const N: usize>(value: &[u8]) -> Self {
37        let mut inlined = Self {
38            size: N.try_into().vortex_unwrap(),
39            data: [0u8; BinaryView::MAX_INLINED_SIZE],
40        };
41        inlined.data[..N].copy_from_slice(&value[..N]);
42        inlined
43    }
44
45    #[inline]
46    pub fn value(&self) -> &[u8] {
47        &self.data[0..(self.size as usize)]
48    }
49}
50
51#[derive(Clone, Copy, Debug)]
52#[repr(C, align(8))]
53pub struct Ref {
54    size: u32,
55    prefix: [u8; 4],
56    buffer_index: u32,
57    offset: u32,
58}
59
60impl Ref {
61    pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
62        Self {
63            size,
64            prefix,
65            buffer_index,
66            offset,
67        }
68    }
69
70    #[inline]
71    pub fn buffer_index(&self) -> u32 {
72        self.buffer_index
73    }
74
75    #[inline]
76    pub fn offset(&self) -> u32 {
77        self.offset
78    }
79
80    #[inline]
81    pub fn prefix(&self) -> &[u8; 4] {
82        &self.prefix
83    }
84
85    #[inline]
86    pub fn to_range(&self) -> Range<usize> {
87        self.offset as usize..(self.offset + self.size) as usize
88    }
89}
90
91#[derive(Clone, Copy)]
92#[repr(C, align(16))]
93pub union BinaryView {
94    // Numeric representation. This is logically `u128`, but we split it into the high and low
95    // bits to preserve the alignment.
96    le_bytes: [u8; 16],
97
98    // Inlined representation: strings <= 12 bytes
99    inlined: Inlined,
100
101    // Reference type: strings > 12 bytes.
102    _ref: Ref,
103}
104
105assert_eq_size!(BinaryView, [u8; 16]);
106assert_eq_size!(Inlined, [u8; 16]);
107assert_eq_size!(Ref, [u8; 16]);
108assert_eq_align!(BinaryView, u128);
109
110impl BinaryView {
111    pub const MAX_INLINED_SIZE: usize = 12;
112
113    /// Create a view from a value, block and offset
114    ///
115    /// Depending on the length of the provided value either a new inlined
116    /// or a reference view will be constructed.
117    ///
118    /// Adapted from arrow-rs <https://github.com/apache/arrow-rs/blob/f4fde769ab6e1a9b75f890b7f8b47bc22800830b/arrow-array/src/builder/generic_bytes_view_builder.rs#L524>
119    /// Explicitly enumerating inlined view produces code that avoids calling generic `ptr::copy_non_interleave` that's slower than explicit stores
120    #[inline(never)]
121    pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
122        match value.len() {
123            0 => Self {
124                inlined: Inlined::new::<0>(value),
125            },
126            1 => Self {
127                inlined: Inlined::new::<1>(value),
128            },
129            2 => Self {
130                inlined: Inlined::new::<2>(value),
131            },
132            3 => Self {
133                inlined: Inlined::new::<3>(value),
134            },
135            4 => Self {
136                inlined: Inlined::new::<4>(value),
137            },
138            5 => Self {
139                inlined: Inlined::new::<5>(value),
140            },
141            6 => Self {
142                inlined: Inlined::new::<6>(value),
143            },
144            7 => Self {
145                inlined: Inlined::new::<7>(value),
146            },
147            8 => Self {
148                inlined: Inlined::new::<8>(value),
149            },
150            9 => Self {
151                inlined: Inlined::new::<9>(value),
152            },
153            10 => Self {
154                inlined: Inlined::new::<10>(value),
155            },
156            11 => Self {
157                inlined: Inlined::new::<11>(value),
158            },
159            12 => Self {
160                inlined: Inlined::new::<12>(value),
161            },
162            _ => Self {
163                _ref: Ref::new(
164                    u32::try_from(value.len()).vortex_unwrap(),
165                    value[0..4].try_into().vortex_unwrap(),
166                    block,
167                    offset,
168                ),
169            },
170        }
171    }
172
173    /// Create a new empty view
174    #[inline]
175    pub fn empty_view() -> Self {
176        Self::new_inlined(&[])
177    }
178
179    /// Create a new inlined binary view
180    #[inline]
181    pub fn new_inlined(value: &[u8]) -> Self {
182        assert!(
183            value.len() <= Self::MAX_INLINED_SIZE,
184            "expected inlined value to be <= 12 bytes, was {}",
185            value.len()
186        );
187
188        Self::make_view(value, 0, 0)
189    }
190
191    #[inline]
192    pub fn len(&self) -> u32 {
193        unsafe { self.inlined.size }
194    }
195
196    #[inline]
197    pub fn is_empty(&self) -> bool {
198        self.len() > 0
199    }
200
201    #[inline]
202    #[allow(clippy::cast_possible_truncation)]
203    pub fn is_inlined(&self) -> bool {
204        self.len() <= (Self::MAX_INLINED_SIZE as u32)
205    }
206
207    pub fn as_inlined(&self) -> &Inlined {
208        unsafe { &self.inlined }
209    }
210
211    pub fn as_view(&self) -> &Ref {
212        unsafe { &self._ref }
213    }
214
215    pub fn as_u128(&self) -> u128 {
216        // SAFETY: binary view always safe to read as u128 LE bytes
217        unsafe { u128::from_le_bytes(self.le_bytes) }
218    }
219
220    /// Override the buffer reference with the given buffer_idx, only if this view is not inlined.
221    #[inline(always)]
222    pub fn with_buffer_idx(self, buffer_idx: u32) -> Self {
223        if self.is_inlined() {
224            self
225        } else {
226            // Referencing views must have their buffer_index adjusted with new offsets
227            let view_ref = self.as_view();
228            Self {
229                _ref: Ref::new(
230                    self.len(),
231                    *view_ref.prefix(),
232                    buffer_idx,
233                    view_ref.offset(),
234                ),
235            }
236        }
237    }
238
239    /// Shifts the buffer reference by the view by a given offset, useful when merging many
240    /// varbinview arrays into one.
241    #[inline(always)]
242    pub fn offset_view(self, offset: u32) -> Self {
243        if self.is_inlined() {
244            self
245        } else {
246            // Referencing views must have their buffer_index adjusted with new offsets
247            let view_ref = self.as_view();
248            Self {
249                _ref: Ref::new(
250                    self.len(),
251                    *view_ref.prefix(),
252                    offset + view_ref.buffer_index(),
253                    view_ref.offset(),
254                ),
255            }
256        }
257    }
258}
259
260impl From<u128> for BinaryView {
261    fn from(value: u128) -> Self {
262        BinaryView {
263            le_bytes: value.to_le_bytes(),
264        }
265    }
266}
267
268impl Debug for BinaryView {
269    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
270        let mut s = f.debug_struct("BinaryView");
271        if self.is_inlined() {
272            s.field("inline", &"i".to_string());
273        } else {
274            s.field("ref", &"r".to_string());
275        }
276        s.finish()
277    }
278}
279
280vtable!(VarBinView);
281
282impl VTable for VarBinViewVTable {
283    type Array = VarBinViewArray;
284    type Encoding = VarBinViewEncoding;
285
286    type ArrayVTable = Self;
287    type CanonicalVTable = Self;
288    type OperationsVTable = Self;
289    type ValidityVTable = ValidityVTableFromValidityHelper;
290    type VisitorVTable = Self;
291    type ComputeVTable = NotSupported;
292    type EncodeVTable = NotSupported;
293    type SerdeVTable = Self;
294
295    fn id(_encoding: &Self::Encoding) -> EncodingId {
296        EncodingId::new_ref("vortex.varbinview")
297    }
298
299    fn encoding(_array: &Self::Array) -> EncodingRef {
300        EncodingRef::new_ref(VarBinViewEncoding.as_ref())
301    }
302}
303
304/// A variable-length binary view array that stores strings and binary data efficiently.
305///
306/// This mirrors the Apache Arrow StringView/BinaryView array encoding and provides
307/// an optimized representation for variable-length data with excellent performance
308/// characteristics for both short and long strings.
309///
310/// ## Data Layout
311///
312/// The array uses a hybrid storage approach with two main components:
313/// - **Views buffer**: Array of 16-byte `BinaryView` entries (one per logical element)
314/// - **Data buffers**: Shared backing storage for strings longer than 12 bytes
315///
316/// ## View Structure
317///
318/// Commonly referred to as "German Strings", each 16-byte view entry contains either:
319/// - **Inlined data**: For strings ≤ 12 bytes, the entire string is stored directly in the view
320/// - **Reference data**: For strings > 12 bytes, contains:
321///   - String length (4 bytes)
322///   - First 4 bytes of string as prefix (4 bytes)
323///   - Buffer index and offset (8 bytes total)
324///
325/// The following ASCII graphic is reproduced verbatim from the Arrow documentation:
326///
327/// ```text
328///                         ┌──────┬────────────────────────┐
329///                         │length│      string value      │
330///    Strings (len <= 12)  │      │    (padded with 0)     │
331///                         └──────┴────────────────────────┘
332///                          0    31                      127
333///
334///                         ┌───────┬───────┬───────┬───────┐
335///                         │length │prefix │  buf  │offset │
336///    Strings (len > 12)   │       │       │ index │       │
337///                         └───────┴───────┴───────┴───────┘
338///                          0    31       63      95    127
339/// ```
340///
341/// # Examples
342///
343/// ```
344/// use vortex_array::arrays::VarBinViewArray;
345/// use vortex_dtype::{DType, Nullability};
346/// use vortex_array::IntoArray;
347///
348/// // Create from an Iterator<Item = &str>
349/// let array = VarBinViewArray::from_iter_str([
350///         "inlined",
351///         "this string is outlined"
352/// ]);
353///
354/// assert_eq!(array.len(), 2);
355///
356/// // Access individual strings
357/// let first = array.bytes_at(0);
358/// assert_eq!(first.as_slice(), b"inlined"); // "short"
359///
360/// let second = array.bytes_at(1);
361/// assert_eq!(second.as_slice(), b"this string is outlined"); // Long string
362/// ```
363#[derive(Clone, Debug)]
364pub struct VarBinViewArray {
365    dtype: DType,
366    buffers: Arc<[ByteBuffer]>,
367    views: Buffer<BinaryView>,
368    validity: Validity,
369    stats_set: ArrayStats,
370}
371
372#[derive(Clone, Debug)]
373pub struct VarBinViewEncoding;
374
375impl VarBinViewArray {
376    pub fn try_new(
377        views: Buffer<BinaryView>,
378        buffers: Arc<[ByteBuffer]>,
379        dtype: DType,
380        validity: Validity,
381    ) -> VortexResult<Self> {
382        if views.alignment() != Alignment::of::<BinaryView>() {
383            vortex_bail!("Views must be aligned to a 128 bits");
384        }
385
386        if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
387            vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
388        }
389
390        if dtype.is_nullable() == (validity == Validity::NonNullable) {
391            vortex_bail!("incorrect validity {:?}", validity);
392        }
393
394        Ok(Self {
395            dtype,
396            buffers,
397            views,
398            validity,
399            stats_set: Default::default(),
400        })
401    }
402
403    /// Number of raw string data buffers held by this array.
404    pub fn nbuffers(&self) -> usize {
405        self.buffers.len()
406    }
407
408    /// Access to the primitive views buffer.
409    ///
410    /// Variable-sized binary view buffer contain a "view" child array, with 16-byte entries that
411    /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of
412    /// the string (if the string has 12 bytes or fewer).
413    #[inline]
414    pub fn views(&self) -> &Buffer<BinaryView> {
415        &self.views
416    }
417
418    /// Access value bytes at a given index
419    ///
420    /// Will return a bytebuffer pointing to the underlying data without performing a copy
421    #[inline]
422    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
423        let views = self.views();
424        let view = &views[index];
425        // Expect this to be the common case: strings > 12 bytes.
426        if !view.is_inlined() {
427            let view_ref = view.as_view();
428            self.buffer(view_ref.buffer_index() as usize)
429                .slice(view_ref.to_range())
430        } else {
431            // Return access to the range of bytes around it.
432            views
433                .clone()
434                .into_byte_buffer()
435                .slice_ref(view.as_inlined().value())
436        }
437    }
438
439    /// Access one of the backing data buffers.
440    ///
441    /// # Panics
442    ///
443    /// This method panics if the provided index is out of bounds for the set of buffers provided
444    /// at construction time.
445    #[inline]
446    pub fn buffer(&self, idx: usize) -> &ByteBuffer {
447        if idx >= self.nbuffers() {
448            vortex_panic!(
449                "{idx} buffer index out of bounds, there are {} buffers",
450                self.nbuffers()
451            );
452        }
453        &self.buffers[idx]
454    }
455
456    /// Iterate over the underlying raw data buffers, not including the views buffer.
457    #[inline]
458    pub fn buffers(&self) -> &Arc<[ByteBuffer]> {
459        &self.buffers
460    }
461
462    /// Accumulate an iterable set of values into our type here.
463    #[allow(clippy::same_name_method)]
464    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
465        iter: I,
466        dtype: DType,
467    ) -> Self {
468        let iter = iter.into_iter();
469        let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
470
471        for item in iter {
472            match item {
473                None => builder.append_null(),
474                Some(v) => builder.append_value(v),
475            }
476        }
477
478        builder.finish_into_varbinview()
479    }
480
481    pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
482        let iter = iter.into_iter();
483        let mut builder = VarBinViewBuilder::with_capacity(
484            DType::Utf8(Nullability::NonNullable),
485            iter.size_hint().0,
486        );
487
488        for item in iter {
489            builder.append_value(item.as_ref());
490        }
491
492        builder.finish_into_varbinview()
493    }
494
495    pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
496        iter: I,
497    ) -> Self {
498        let iter = iter.into_iter();
499        let mut builder = VarBinViewBuilder::with_capacity(
500            DType::Utf8(Nullability::Nullable),
501            iter.size_hint().0,
502        );
503
504        for item in iter {
505            match item {
506                None => builder.append_null(),
507                Some(v) => builder.append_value(v.as_ref()),
508            }
509        }
510
511        builder.finish_into_varbinview()
512    }
513
514    pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
515        let iter = iter.into_iter();
516        let mut builder = VarBinViewBuilder::with_capacity(
517            DType::Binary(Nullability::NonNullable),
518            iter.size_hint().0,
519        );
520
521        for item in iter {
522            builder.append_value(item.as_ref());
523        }
524
525        builder.finish_into_varbinview()
526    }
527
528    pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
529        iter: I,
530    ) -> Self {
531        let iter = iter.into_iter();
532        let mut builder = VarBinViewBuilder::with_capacity(
533            DType::Binary(Nullability::Nullable),
534            iter.size_hint().0,
535        );
536
537        for item in iter {
538            match item {
539                None => builder.append_null(),
540                Some(v) => builder.append_value(v.as_ref()),
541            }
542        }
543
544        builder.finish_into_varbinview()
545    }
546}
547
548impl ArrayVTable<VarBinViewVTable> for VarBinViewVTable {
549    fn len(array: &VarBinViewArray) -> usize {
550        array.views.len()
551    }
552
553    fn dtype(array: &VarBinViewArray) -> &DType {
554        &array.dtype
555    }
556
557    fn stats(array: &VarBinViewArray) -> StatsSetRef<'_> {
558        array.stats_set.to_ref(array.as_ref())
559    }
560}
561
562impl ValidityHelper for VarBinViewArray {
563    fn validity(&self) -> &Validity {
564        &self.validity
565    }
566}
567
568impl CanonicalVTable<VarBinViewVTable> for VarBinViewVTable {
569    fn canonicalize(array: &VarBinViewArray) -> VortexResult<Canonical> {
570        Ok(Canonical::VarBinView(array.clone()))
571    }
572
573    fn append_to_builder(
574        array: &VarBinViewArray,
575        builder: &mut dyn ArrayBuilder,
576    ) -> VortexResult<()> {
577        builder.extend_from_array(array.as_ref())
578    }
579}
580
581impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
582    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
583        Self::from_iter_nullable_bin(iter)
584    }
585}
586
587impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
588    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
589        Self::from_iter_nullable_bin(iter)
590    }
591}
592
593impl FromIterator<Option<String>> for VarBinViewArray {
594    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
595        Self::from_iter_nullable_str(iter)
596    }
597}
598
599impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
600    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
601        Self::from_iter_nullable_str(iter)
602    }
603}
604
605#[cfg(test)]
606mod test {
607    use vortex_scalar::Scalar;
608
609    use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
610    use crate::{Array, Canonical, IntoArray};
611
612    #[test]
613    pub fn varbin_view() {
614        let binary_arr =
615            VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"]);
616        assert_eq!(binary_arr.len(), 2);
617        assert_eq!(
618            binary_arr.scalar_at(0).unwrap(),
619            Scalar::from("hello world")
620        );
621        assert_eq!(
622            binary_arr.scalar_at(1).unwrap(),
623            Scalar::from("hello world this is a long string")
624        );
625    }
626
627    #[test]
628    pub fn slice_array() {
629        let binary_arr =
630            VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"])
631                .slice(1, 2)
632                .unwrap();
633        assert_eq!(
634            binary_arr.scalar_at(0).unwrap(),
635            Scalar::from("hello world this is a long string")
636        );
637    }
638
639    #[test]
640    pub fn flatten_array() {
641        let binary_arr = VarBinViewArray::from_iter_str(["string1", "string2"]);
642
643        let flattened = binary_arr.to_canonical().unwrap();
644        assert!(matches!(flattened, Canonical::VarBinView(_)));
645
646        let var_bin = flattened.into_varbinview().unwrap().into_array();
647        assert_eq!(var_bin.scalar_at(0).unwrap(), Scalar::from("string1"));
648        assert_eq!(var_bin.scalar_at(1).unwrap(), Scalar::from("string2"));
649    }
650
651    #[test]
652    pub fn binary_view_size_and_alignment() {
653        assert_eq!(size_of::<BinaryView>(), 16);
654        assert_eq!(align_of::<BinaryView>(), 16);
655    }
656}