vortex_array/arrays/varbinview/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::{Debug, Formatter};
5use std::hash::{Hash, Hasher};
6use std::ops::Range;
7use std::sync::Arc;
8
9use static_assertions::{assert_eq_align, assert_eq_size};
10use vortex_buffer::{Buffer, ByteBuffer};
11use vortex_dtype::{DType, Nullability};
12use vortex_error::{
13    VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_ensure, vortex_err, vortex_panic,
14};
15
16use crate::builders::{ArrayBuilder, VarBinViewBuilder};
17use crate::stats::{ArrayStats, StatsSetRef};
18use crate::validity::Validity;
19use crate::vtable::{
20    ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper,
21    ValidityVTableFromValidityHelper,
22};
23use crate::{Canonical, EncodingId, EncodingRef, vtable};
24
25mod accessor;
26mod compact;
27mod compute;
28mod ops;
29mod serde;
30
31#[derive(Clone, Copy, Debug, PartialEq, Eq)]
32#[repr(C, align(8))]
33pub struct Inlined {
34    size: u32,
35    data: [u8; BinaryView::MAX_INLINED_SIZE],
36}
37
38impl Inlined {
39    fn new<const N: usize>(value: &[u8]) -> Self {
40        let mut inlined = Self {
41            size: N.try_into().vortex_unwrap(),
42            data: [0u8; BinaryView::MAX_INLINED_SIZE],
43        };
44        inlined.data[..N].copy_from_slice(&value[..N]);
45        inlined
46    }
47
48    #[inline]
49    pub fn value(&self) -> &[u8] {
50        &self.data[0..(self.size as usize)]
51    }
52}
53
54#[derive(Clone, Copy, Debug)]
55#[repr(C, align(8))]
56pub struct Ref {
57    size: u32,
58    prefix: [u8; 4],
59    buffer_index: u32,
60    offset: u32,
61}
62
63impl Ref {
64    pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self {
65        Self {
66            size,
67            prefix,
68            buffer_index,
69            offset,
70        }
71    }
72
73    #[inline]
74    pub fn buffer_index(&self) -> u32 {
75        self.buffer_index
76    }
77
78    #[inline]
79    pub fn offset(&self) -> u32 {
80        self.offset
81    }
82
83    #[inline]
84    pub fn prefix(&self) -> &[u8; 4] {
85        &self.prefix
86    }
87
88    #[inline]
89    pub fn to_range(&self) -> Range<usize> {
90        self.offset as usize..(self.offset + self.size) as usize
91    }
92}
93
94#[derive(Clone, Copy)]
95#[repr(C, align(16))]
96pub union BinaryView {
97    // Numeric representation. This is logically `u128`, but we split it into the high and low
98    // bits to preserve the alignment.
99    le_bytes: [u8; 16],
100
101    // Inlined representation: strings <= 12 bytes
102    inlined: Inlined,
103
104    // Reference type: strings > 12 bytes.
105    _ref: Ref,
106}
107
108assert_eq_size!(BinaryView, [u8; 16]);
109assert_eq_size!(Inlined, [u8; 16]);
110assert_eq_size!(Ref, [u8; 16]);
111assert_eq_align!(BinaryView, u128);
112
113impl Hash for BinaryView {
114    fn hash<H: Hasher>(&self, state: &mut H) {
115        unsafe { std::mem::transmute::<&BinaryView, &[u8; 16]>(self) }.hash(state);
116    }
117}
118
119impl BinaryView {
120    pub const MAX_INLINED_SIZE: usize = 12;
121
122    /// Create a view from a value, block and offset
123    ///
124    /// Depending on the length of the provided value either a new inlined
125    /// or a reference view will be constructed.
126    ///
127    /// Adapted from arrow-rs <https://github.com/apache/arrow-rs/blob/f4fde769ab6e1a9b75f890b7f8b47bc22800830b/arrow-array/src/builder/generic_bytes_view_builder.rs#L524>
128    /// Explicitly enumerating inlined view produces code that avoids calling generic `ptr::copy_non_interleave` that's slower than explicit stores
129    #[inline(never)]
130    pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self {
131        match value.len() {
132            0 => Self {
133                inlined: Inlined::new::<0>(value),
134            },
135            1 => Self {
136                inlined: Inlined::new::<1>(value),
137            },
138            2 => Self {
139                inlined: Inlined::new::<2>(value),
140            },
141            3 => Self {
142                inlined: Inlined::new::<3>(value),
143            },
144            4 => Self {
145                inlined: Inlined::new::<4>(value),
146            },
147            5 => Self {
148                inlined: Inlined::new::<5>(value),
149            },
150            6 => Self {
151                inlined: Inlined::new::<6>(value),
152            },
153            7 => Self {
154                inlined: Inlined::new::<7>(value),
155            },
156            8 => Self {
157                inlined: Inlined::new::<8>(value),
158            },
159            9 => Self {
160                inlined: Inlined::new::<9>(value),
161            },
162            10 => Self {
163                inlined: Inlined::new::<10>(value),
164            },
165            11 => Self {
166                inlined: Inlined::new::<11>(value),
167            },
168            12 => Self {
169                inlined: Inlined::new::<12>(value),
170            },
171            _ => Self {
172                _ref: Ref::new(
173                    u32::try_from(value.len()).vortex_unwrap(),
174                    value[0..4].try_into().vortex_unwrap(),
175                    block,
176                    offset,
177                ),
178            },
179        }
180    }
181
182    /// Create a new empty view
183    #[inline]
184    pub fn empty_view() -> Self {
185        Self::new_inlined(&[])
186    }
187
188    /// Create a new inlined binary view
189    #[inline]
190    pub fn new_inlined(value: &[u8]) -> Self {
191        assert!(
192            value.len() <= Self::MAX_INLINED_SIZE,
193            "expected inlined value to be <= 12 bytes, was {}",
194            value.len()
195        );
196
197        Self::make_view(value, 0, 0)
198    }
199
200    #[inline]
201    pub fn len(&self) -> u32 {
202        unsafe { self.inlined.size }
203    }
204
205    #[inline]
206    pub fn is_empty(&self) -> bool {
207        self.len() > 0
208    }
209
210    #[inline]
211    #[allow(clippy::cast_possible_truncation)]
212    pub fn is_inlined(&self) -> bool {
213        self.len() <= (Self::MAX_INLINED_SIZE as u32)
214    }
215
216    pub fn as_inlined(&self) -> &Inlined {
217        unsafe { &self.inlined }
218    }
219
220    pub fn as_view(&self) -> &Ref {
221        unsafe { &self._ref }
222    }
223
224    pub fn as_u128(&self) -> u128 {
225        // SAFETY: binary view always safe to read as u128 LE bytes
226        unsafe { u128::from_le_bytes(self.le_bytes) }
227    }
228
229    /// Override the buffer reference with the given buffer_idx, only if this view is not inlined.
230    #[inline(always)]
231    pub fn with_buffer_idx(self, buffer_idx: u32) -> Self {
232        if self.is_inlined() {
233            self
234        } else {
235            // Referencing views must have their buffer_index adjusted with new offsets
236            let view_ref = self.as_view();
237            Self {
238                _ref: Ref::new(
239                    self.len(),
240                    *view_ref.prefix(),
241                    buffer_idx,
242                    view_ref.offset(),
243                ),
244            }
245        }
246    }
247
248    /// Shifts the buffer reference by the view by a given offset, useful when merging many
249    /// varbinview arrays into one.
250    #[inline(always)]
251    pub fn offset_view(self, offset: u32) -> Self {
252        if self.is_inlined() {
253            self
254        } else {
255            // Referencing views must have their buffer_index adjusted with new offsets
256            let view_ref = self.as_view();
257            Self {
258                _ref: Ref::new(
259                    self.len(),
260                    *view_ref.prefix(),
261                    offset + view_ref.buffer_index(),
262                    view_ref.offset(),
263                ),
264            }
265        }
266    }
267}
268
269impl From<u128> for BinaryView {
270    fn from(value: u128) -> Self {
271        BinaryView {
272            le_bytes: value.to_le_bytes(),
273        }
274    }
275}
276
277impl Debug for BinaryView {
278    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
279        let mut s = f.debug_struct("BinaryView");
280        if self.is_inlined() {
281            s.field("inline", &"i".to_string());
282        } else {
283            s.field("ref", &"r".to_string());
284        }
285        s.finish()
286    }
287}
288
289vtable!(VarBinView);
290
291impl VTable for VarBinViewVTable {
292    type Array = VarBinViewArray;
293    type Encoding = VarBinViewEncoding;
294
295    type ArrayVTable = Self;
296    type CanonicalVTable = Self;
297    type OperationsVTable = Self;
298    type ValidityVTable = ValidityVTableFromValidityHelper;
299    type VisitorVTable = Self;
300    type ComputeVTable = NotSupported;
301    type EncodeVTable = NotSupported;
302    type PipelineVTable = NotSupported;
303    type SerdeVTable = Self;
304
305    fn id(_encoding: &Self::Encoding) -> EncodingId {
306        EncodingId::new_ref("vortex.varbinview")
307    }
308
309    fn encoding(_array: &Self::Array) -> EncodingRef {
310        EncodingRef::new_ref(VarBinViewEncoding.as_ref())
311    }
312}
313
314/// A variable-length binary view array that stores strings and binary data efficiently.
315///
316/// This mirrors the Apache Arrow StringView/BinaryView array encoding and provides
317/// an optimized representation for variable-length data with excellent performance
318/// characteristics for both short and long strings.
319///
320/// ## Data Layout
321///
322/// The array uses a hybrid storage approach with two main components:
323/// - **Views buffer**: Array of 16-byte `BinaryView` entries (one per logical element)
324/// - **Data buffers**: Shared backing storage for strings longer than 12 bytes
325///
326/// ## View Structure
327///
328/// Commonly referred to as "German Strings", each 16-byte view entry contains either:
329/// - **Inlined data**: For strings ≤ 12 bytes, the entire string is stored directly in the view
330/// - **Reference data**: For strings > 12 bytes, contains:
331///   - String length (4 bytes)
332///   - First 4 bytes of string as prefix (4 bytes)
333///   - Buffer index and offset (8 bytes total)
334///
335/// The following ASCII graphic is reproduced verbatim from the Arrow documentation:
336///
337/// ```text
338///                         ┌──────┬────────────────────────┐
339///                         │length│      string value      │
340///    Strings (len <= 12)  │      │    (padded with 0)     │
341///                         └──────┴────────────────────────┘
342///                          0    31                      127
343///
344///                         ┌───────┬───────┬───────┬───────┐
345///                         │length │prefix │  buf  │offset │
346///    Strings (len > 12)   │       │       │ index │       │
347///                         └───────┴───────┴───────┴───────┘
348///                          0    31       63      95    127
349/// ```
350///
351/// # Examples
352///
353/// ```
354/// use vortex_array::arrays::VarBinViewArray;
355/// use vortex_dtype::{DType, Nullability};
356/// use vortex_array::IntoArray;
357///
358/// // Create from an Iterator<Item = &str>
359/// let array = VarBinViewArray::from_iter_str([
360///         "inlined",
361///         "this string is outlined"
362/// ]);
363///
364/// assert_eq!(array.len(), 2);
365///
366/// // Access individual strings
367/// let first = array.bytes_at(0);
368/// assert_eq!(first.as_slice(), b"inlined"); // "short"
369///
370/// let second = array.bytes_at(1);
371/// assert_eq!(second.as_slice(), b"this string is outlined"); // Long string
372/// ```
373#[derive(Clone, Debug)]
374pub struct VarBinViewArray {
375    dtype: DType,
376    buffers: Arc<[ByteBuffer]>,
377    views: Buffer<BinaryView>,
378    validity: Validity,
379    stats_set: ArrayStats,
380}
381
382#[derive(Clone, Debug)]
383pub struct VarBinViewEncoding;
384
385impl VarBinViewArray {
386    fn validate(
387        views: &Buffer<BinaryView>,
388        buffers: &Arc<[ByteBuffer]>,
389        dtype: &DType,
390        validity: &Validity,
391    ) -> VortexResult<()> {
392        vortex_ensure!(
393            validity.nullability() == dtype.nullability(),
394            "validity {:?} incompatible with nullability {:?}",
395            validity,
396            dtype.nullability()
397        );
398
399        match dtype {
400            DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
401                simdutf8::basic::from_utf8(string).is_ok()
402            })?,
403            DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
404            _ => vortex_bail!("invalid DType {dtype} for `VarBinViewArray`"),
405        }
406
407        Ok(())
408    }
409
410    fn validate_views<F>(
411        views: &Buffer<BinaryView>,
412        buffers: &Arc<[ByteBuffer]>,
413        validity: &Validity,
414        validator: F,
415    ) -> VortexResult<()>
416    where
417        F: Fn(&[u8]) -> bool,
418    {
419        for (idx, &view) in views.iter().enumerate() {
420            if validity.is_null(idx)? {
421                continue;
422            }
423
424            if view.is_inlined() {
425                // Validate the inline bytestring
426                let bytes = &unsafe { view.inlined }.data[..view.len() as usize];
427                vortex_ensure!(
428                    validator(bytes),
429                    "view at index {idx}: inlined bytes failed utf-8 validation"
430                );
431            } else {
432                // Validate the view pointer
433                let view = view.as_view();
434                let buf_index = view.buffer_index as usize;
435                let start_offset = view.offset as usize;
436                let end_offset = start_offset.saturating_add(view.size as usize);
437
438                let buf = buffers.get(buf_index).ok_or_else(||
439                    vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers",
440                        buffers.len()))?;
441
442                vortex_ensure!(
443                    start_offset < buf.len(),
444                    "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
445                    buf.len(),
446                );
447
448                vortex_ensure!(
449                    end_offset <= buf.len(),
450                    "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
451                    buf.len(),
452                );
453
454                // Make sure the prefix data matches the buffer data.
455                let bytes = &buf[start_offset..end_offset];
456                vortex_ensure!(
457                    view.prefix == bytes[..4],
458                    "VarBinView prefix does not match full string"
459                );
460
461                // Validate the full string
462                vortex_ensure!(
463                    validator(bytes),
464                    "view at index {idx}: outlined bytes fails utf-8 validation"
465                );
466            }
467        }
468
469        Ok(())
470    }
471}
472
473impl VarBinViewArray {
474    /// Build a new `VarBinViewArray` from components with validation.
475    ///
476    /// # Safety
477    /// This should only be used when you know for certain that all components are already
478    /// validated, for example during array operations that preserve the invariants of the encoding.
479    ///
480    /// See [`VarBinViewArray::try_new`] for a safe constructor that does validation.
481    pub unsafe fn new_unchecked(
482        views: Buffer<BinaryView>,
483        buffers: Arc<[ByteBuffer]>,
484        dtype: DType,
485        validity: Validity,
486    ) -> Self {
487        Self {
488            dtype,
489            buffers,
490            views,
491            validity,
492            stats_set: Default::default(),
493        }
494    }
495
496    pub fn new(
497        views: Buffer<BinaryView>,
498        buffers: Arc<[ByteBuffer]>,
499        dtype: DType,
500        validity: Validity,
501    ) -> Self {
502        Self::try_new(views, buffers, dtype, validity).vortex_expect("VarBinViewArray new")
503    }
504
505    pub fn try_new(
506        views: Buffer<BinaryView>,
507        buffers: Arc<[ByteBuffer]>,
508        dtype: DType,
509        validity: Validity,
510    ) -> VortexResult<Self> {
511        Self::validate(&views, &buffers, &dtype, &validity)?;
512
513        Ok(Self {
514            dtype,
515            buffers,
516            views,
517            validity,
518            stats_set: Default::default(),
519        })
520    }
521
522    /// Number of raw string data buffers held by this array.
523    pub fn nbuffers(&self) -> usize {
524        self.buffers.len()
525    }
526
527    /// Access to the primitive views buffer.
528    ///
529    /// Variable-sized binary view buffer contain a "view" child array, with 16-byte entries that
530    /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of
531    /// the string (if the string has 12 bytes or fewer).
532    #[inline]
533    pub fn views(&self) -> &Buffer<BinaryView> {
534        &self.views
535    }
536
537    /// Access value bytes at a given index
538    ///
539    /// Will return a `ByteBuffer` containing the data without performing a copy.
540    #[inline]
541    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
542        let views = self.views();
543        let view = &views[index];
544        // Expect this to be the common case: strings > 12 bytes.
545        if !view.is_inlined() {
546            let view_ref = view.as_view();
547            self.buffer(view_ref.buffer_index() as usize)
548                .slice(view_ref.to_range())
549        } else {
550            // Return access to the range of bytes around it.
551            views
552                .clone()
553                .into_byte_buffer()
554                .slice_ref(view.as_inlined().value())
555        }
556    }
557
558    /// Access one of the backing data buffers.
559    ///
560    /// # Panics
561    ///
562    /// This method panics if the provided index is out of bounds for the set of buffers provided
563    /// at construction time.
564    #[inline]
565    pub fn buffer(&self, idx: usize) -> &ByteBuffer {
566        if idx >= self.nbuffers() {
567            vortex_panic!(
568                "{idx} buffer index out of bounds, there are {} buffers",
569                self.nbuffers()
570            );
571        }
572        &self.buffers[idx]
573    }
574
575    /// Iterate over the underlying raw data buffers, not including the views buffer.
576    #[inline]
577    pub fn buffers(&self) -> &Arc<[ByteBuffer]> {
578        &self.buffers
579    }
580
581    /// Accumulate an iterable set of values into our type here.
582    #[allow(clippy::same_name_method)]
583    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
584        iter: I,
585        dtype: DType,
586    ) -> Self {
587        let iter = iter.into_iter();
588        let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
589
590        for item in iter {
591            match item {
592                None => builder.append_null(),
593                Some(v) => builder.append_value(v),
594            }
595        }
596
597        builder.finish_into_varbinview()
598    }
599
600    pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
601        let iter = iter.into_iter();
602        let mut builder = VarBinViewBuilder::with_capacity(
603            DType::Utf8(Nullability::NonNullable),
604            iter.size_hint().0,
605        );
606
607        for item in iter {
608            builder.append_value(item.as_ref());
609        }
610
611        builder.finish_into_varbinview()
612    }
613
614    pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
615        iter: I,
616    ) -> Self {
617        let iter = iter.into_iter();
618        let mut builder = VarBinViewBuilder::with_capacity(
619            DType::Utf8(Nullability::Nullable),
620            iter.size_hint().0,
621        );
622
623        for item in iter {
624            match item {
625                None => builder.append_null(),
626                Some(v) => builder.append_value(v.as_ref()),
627            }
628        }
629
630        builder.finish_into_varbinview()
631    }
632
633    pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
634        let iter = iter.into_iter();
635        let mut builder = VarBinViewBuilder::with_capacity(
636            DType::Binary(Nullability::NonNullable),
637            iter.size_hint().0,
638        );
639
640        for item in iter {
641            builder.append_value(item.as_ref());
642        }
643
644        builder.finish_into_varbinview()
645    }
646
647    pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
648        iter: I,
649    ) -> Self {
650        let iter = iter.into_iter();
651        let mut builder = VarBinViewBuilder::with_capacity(
652            DType::Binary(Nullability::Nullable),
653            iter.size_hint().0,
654        );
655
656        for item in iter {
657            match item {
658                None => builder.append_null(),
659                Some(v) => builder.append_value(v.as_ref()),
660            }
661        }
662
663        builder.finish_into_varbinview()
664    }
665}
666
667impl ArrayVTable<VarBinViewVTable> for VarBinViewVTable {
668    fn len(array: &VarBinViewArray) -> usize {
669        array.views.len()
670    }
671
672    fn dtype(array: &VarBinViewArray) -> &DType {
673        &array.dtype
674    }
675
676    fn stats(array: &VarBinViewArray) -> StatsSetRef<'_> {
677        array.stats_set.to_ref(array.as_ref())
678    }
679}
680
681impl ValidityHelper for VarBinViewArray {
682    fn validity(&self) -> &Validity {
683        &self.validity
684    }
685}
686
687impl CanonicalVTable<VarBinViewVTable> for VarBinViewVTable {
688    fn canonicalize(array: &VarBinViewArray) -> VortexResult<Canonical> {
689        Ok(Canonical::VarBinView(array.clone()))
690    }
691
692    fn append_to_builder(
693        array: &VarBinViewArray,
694        builder: &mut dyn ArrayBuilder,
695    ) -> VortexResult<()> {
696        builder.extend_from_array(array.as_ref())
697    }
698}
699
700impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
701    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
702        Self::from_iter_nullable_bin(iter)
703    }
704}
705
706impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
707    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
708        Self::from_iter_nullable_bin(iter)
709    }
710}
711
712impl FromIterator<Option<String>> for VarBinViewArray {
713    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
714        Self::from_iter_nullable_str(iter)
715    }
716}
717
718impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
719    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
720        Self::from_iter_nullable_str(iter)
721    }
722}
723
724#[cfg(test)]
725mod test {
726    use vortex_scalar::Scalar;
727
728    use crate::arrays::varbinview::{BinaryView, VarBinViewArray};
729    use crate::{Array, Canonical, IntoArray};
730
731    #[test]
732    pub fn varbin_view() {
733        let binary_arr =
734            VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"]);
735        assert_eq!(binary_arr.len(), 2);
736        assert_eq!(binary_arr.scalar_at(0), Scalar::from("hello world"));
737        assert_eq!(
738            binary_arr.scalar_at(1),
739            Scalar::from("hello world this is a long string")
740        );
741    }
742
743    #[test]
744    pub fn slice_array() {
745        let binary_arr =
746            VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"])
747                .slice(1, 2);
748        assert_eq!(
749            binary_arr.scalar_at(0),
750            Scalar::from("hello world this is a long string")
751        );
752    }
753
754    #[test]
755    pub fn flatten_array() {
756        let binary_arr = VarBinViewArray::from_iter_str(["string1", "string2"]);
757
758        let flattened = binary_arr.to_canonical().unwrap();
759        assert!(matches!(flattened, Canonical::VarBinView(_)));
760
761        let var_bin = flattened.into_varbinview().unwrap().into_array();
762        assert_eq!(var_bin.scalar_at(0), Scalar::from("string1"));
763        assert_eq!(var_bin.scalar_at(1), Scalar::from("string2"));
764    }
765
766    #[test]
767    pub fn binary_view_size_and_alignment() {
768        assert_eq!(size_of::<BinaryView>(), 16);
769        assert_eq!(align_of::<BinaryView>(), 16);
770    }
771}