Skip to main content

vortex_array/arrays/varbinview/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use vortex_buffer::Alignment;
7use vortex_buffer::Buffer;
8use vortex_buffer::ByteBuffer;
9use vortex_dtype::DType;
10use vortex_dtype::Nullability;
11use vortex_error::VortexExpect;
12use vortex_error::VortexResult;
13use vortex_error::vortex_bail;
14use vortex_error::vortex_ensure;
15use vortex_error::vortex_err;
16use vortex_error::vortex_panic;
17
18use crate::arrays::BinaryView;
19use crate::buffer::BufferHandle;
20use crate::builders::ArrayBuilder;
21use crate::builders::VarBinViewBuilder;
22use crate::stats::ArrayStats;
23use crate::validity::Validity;
24
/// A variable-length binary view array that stores strings and binary data efficiently.
///
/// This mirrors the Apache Arrow StringView/BinaryView array encoding and provides
/// an optimized representation for variable-length data with excellent performance
/// characteristics for both short and long strings.
///
/// ## Data Layout
///
/// The array uses a hybrid storage approach with two main components:
/// - **Views buffer**: Array of 16-byte `BinaryView` entries (one per logical element)
/// - **Data buffers**: Shared backing storage for strings longer than 12 bytes
///
/// ## View Structure
///
/// Commonly referred to as "German Strings", each 16-byte view entry contains either:
/// - **Inlined data**: For strings ≤ 12 bytes, the entire string is stored directly in the view
/// - **Reference data**: For strings > 12 bytes, contains:
///   - String length (4 bytes)
///   - First 4 bytes of string as prefix (4 bytes)
///   - Buffer index and offset (8 bytes total)
///
/// The following ASCII graphic is reproduced verbatim from the Arrow documentation:
///
/// ```text
///                         ┌──────┬────────────────────────┐
///                         │length│      string value      │
///    Strings (len <= 12)  │      │    (padded with 0)     │
///                         └──────┴────────────────────────┘
///                          0    31                      127
///
///                         ┌───────┬───────┬───────┬───────┐
///                         │length │prefix │  buf  │offset │
///    Strings (len > 12)   │       │       │ index │       │
///                         └───────┴───────┴───────┴───────┘
///                          0    31       63      95    127
/// ```
///
/// # Examples
///
/// ```
/// use vortex_array::arrays::VarBinViewArray;
/// use vortex_dtype::{DType, Nullability};
/// use vortex_array::IntoArray;
///
/// // Create from an Iterator<Item = &str>
/// let array = VarBinViewArray::from_iter_str([
///         "inlined",
///         "this string is outlined"
/// ]);
///
/// assert_eq!(array.len(), 2);
///
/// // Access individual strings
/// let first = array.bytes_at(0);
/// assert_eq!(first.as_slice(), b"inlined"); // Short string (<= 12 bytes), stored in the view
///
/// let second = array.bytes_at(1);
/// assert_eq!(second.as_slice(), b"this string is outlined"); // Long string, stored in a data buffer
/// ```
#[derive(Clone, Debug)]
pub struct VarBinViewArray {
    /// Logical type of the array. [`VarBinViewArray::validate`] accepts only `DType::Utf8`
    /// and `DType::Binary`.
    pub(super) dtype: DType,
    /// Data buffers that non-inlined views point into (by buffer index and offset).
    pub(super) buffers: Arc<[BufferHandle]>,
    /// Raw bytes of the views buffer: one 16-byte `BinaryView` per logical element.
    pub(super) views: BufferHandle,
    /// Which elements are null.
    pub(super) validity: Validity,
    /// Statistics attached to this array; the constructors in this file initialise it with
    /// `Default::default()`.
    pub(super) stats_set: ArrayStats,
}
92
93pub struct VarBinViewArrayParts {
94    pub dtype: DType,
95    pub buffers: Arc<[BufferHandle]>,
96    pub views: BufferHandle,
97    pub validity: Validity,
98}
99
100impl VarBinViewArray {
101    /// Creates a new [`VarBinViewArray`].
102    ///
103    /// # Panics
104    ///
105    /// Panics if the provided components do not satisfy the invariants documented
106    /// in [`VarBinViewArray::new_unchecked`].
107    pub fn new(
108        views: Buffer<BinaryView>,
109        buffers: Arc<[ByteBuffer]>,
110        dtype: DType,
111        validity: Validity,
112    ) -> Self {
113        Self::try_new(views, buffers, dtype, validity)
114            .vortex_expect("VarBinViewArray construction failed")
115    }
116
117    /// Creates a new [`VarBinViewArray`] with device or host memory.
118    ///
119    /// # Panics
120    ///
121    /// Panics if the provided components do not satisfy the invariants documented
122    /// in [`VarBinViewArray::new_unchecked`].
123    pub fn new_handle(
124        views: BufferHandle,
125        buffers: Arc<[BufferHandle]>,
126        dtype: DType,
127        validity: Validity,
128    ) -> Self {
129        Self::try_new_handle(views, buffers, dtype, validity)
130            .vortex_expect("VarbinViewArray construction failed")
131    }
132
133    /// Constructs a new `VarBinViewArray`.
134    ///
135    /// See [`VarBinViewArray::new_unchecked`] for more information.
136    ///
137    /// # Errors
138    ///
139    /// Returns an error if the provided components do not satisfy the invariants documented in
140    /// [`VarBinViewArray::new_unchecked`].
141    pub fn try_new(
142        views: Buffer<BinaryView>,
143        buffers: Arc<[ByteBuffer]>,
144        dtype: DType,
145        validity: Validity,
146    ) -> VortexResult<Self> {
147        Self::validate(&views, &buffers, &dtype, &validity)?;
148
149        // SAFETY: validate ensures all invariants are met.
150        Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) })
151    }
152
153    /// Constructs a new `VarBinViewArray`.
154    ///
155    /// See [`VarBinViewArray::new_unchecked`] for more information.
156    ///
157    /// # Errors
158    ///
159    /// Returns an error if the provided components do not satisfy the invariants documented in
160    /// [`VarBinViewArray::new_unchecked`].
161    pub fn try_new_handle(
162        views: BufferHandle,
163        buffers: Arc<[BufferHandle]>,
164        dtype: DType,
165        validity: Validity,
166    ) -> VortexResult<Self> {
167        let views_nbytes = views.len();
168        vortex_ensure!(
169            views_nbytes.is_multiple_of(size_of::<BinaryView>()),
170            "Expected views buffer length ({views_nbytes}) to be a multiple of {}",
171            size_of::<BinaryView>()
172        );
173
174        // TODO(aduffy): device validation.
175        if let Some(host) = views.as_host_opt() {
176            vortex_ensure!(
177                host.is_aligned(Alignment::of::<BinaryView>()),
178                "Views on host must be 16 byte aligned"
179            );
180        }
181
182        // SAFETY: validate ensures all invariants are met.
183        Ok(unsafe { Self::new_handle_unchecked(views, buffers, dtype, validity) })
184    }
185
186    /// Creates a new [`VarBinViewArray`] without validation from these components:
187    ///
188    /// * `views` is a buffer of 16-byte view entries (one per logical element).
189    /// * `buffers` contains the backing storage for strings longer than 12 bytes.
190    /// * `dtype` specifies whether this contains UTF-8 strings or binary data.
191    /// * `validity` holds the null values.
192    ///
193    /// # Safety
194    ///
195    /// The caller must ensure all of the following invariants are satisfied:
196    ///
197    /// ## View Requirements
198    ///
199    /// - Views must be properly formatted 16-byte [`BinaryView`] entries.
200    /// - Inlined views (length ≤ 12) must have valid data in the first `length` bytes.
201    /// - Reference views (length > 12) must:
202    ///   - Have a valid buffer index < `buffers.len()`.
203    ///   - Have valid offsets that don't exceed the referenced buffer's bounds.
204    ///   - Have a 4-byte prefix that matches the actual data at the referenced location.
205    ///
206    /// ## Type Requirements
207    ///
208    /// - `dtype` must be either [`DType::Utf8`] or [`DType::Binary`].
209    /// - For [`DType::Utf8`], all string data (both inlined and referenced) must be valid UTF-8.
210    ///
211    /// ## Validity Requirements
212    ///
213    /// - The validity must have the same nullability as the dtype.
214    /// - If validity is an array, its length must match `views.len()`.
215    pub unsafe fn new_unchecked(
216        views: Buffer<BinaryView>,
217        buffers: Arc<[ByteBuffer]>,
218        dtype: DType,
219        validity: Validity,
220    ) -> Self {
221        #[cfg(debug_assertions)]
222        Self::validate(&views, &buffers, &dtype, &validity)
223            .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters");
224
225        let handles: Vec<BufferHandle> = buffers
226            .iter()
227            .cloned()
228            .map(BufferHandle::new_host)
229            .collect();
230
231        let handles = Arc::from(handles);
232        let view_handle = BufferHandle::new_host(views.into_byte_buffer());
233        unsafe { Self::new_handle_unchecked(view_handle, handles, dtype, validity) }
234    }
235
236    /// Construct a new array from `BufferHandle`s without validation.
237    ///
238    /// # Safety
239    ///
240    /// See documentation in `new_unchecked`.
241    pub unsafe fn new_handle_unchecked(
242        views: BufferHandle,
243        buffers: Arc<[BufferHandle]>,
244        dtype: DType,
245        validity: Validity,
246    ) -> Self {
247        Self {
248            views,
249            buffers,
250            dtype,
251            validity,
252            stats_set: Default::default(),
253        }
254    }
255
256    /// Validates the components that would be used to create a [`VarBinViewArray`].
257    ///
258    /// This function checks all the invariants required by [`VarBinViewArray::new_unchecked`].
259    pub fn validate(
260        views: &Buffer<BinaryView>,
261        buffers: &Arc<[ByteBuffer]>,
262        dtype: &DType,
263        validity: &Validity,
264    ) -> VortexResult<()> {
265        vortex_ensure!(
266            validity.nullability() == dtype.nullability(),
267            InvalidArgument: "validity {:?} incompatible with nullability {:?}",
268            validity,
269            dtype.nullability()
270        );
271
272        match dtype {
273            DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
274                simdutf8::basic::from_utf8(string).is_ok()
275            })?,
276            DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
277            _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
278        }
279
280        Ok(())
281    }
282
283    fn validate_views<F>(
284        views: &Buffer<BinaryView>,
285        buffers: &Arc<[ByteBuffer]>,
286        validity: &Validity,
287        validator: F,
288    ) -> VortexResult<()>
289    where
290        F: Fn(&[u8]) -> bool,
291    {
292        for (idx, &view) in views.iter().enumerate() {
293            if validity.is_null(idx)? {
294                continue;
295            }
296
297            if view.is_inlined() {
298                // Validate the inline bytestring
299                let bytes = &view.as_inlined().data[..view.len() as usize];
300                vortex_ensure!(
301                    validator(bytes),
302                    InvalidArgument: "view at index {idx}: inlined bytes failed utf-8 validation"
303                );
304            } else {
305                // Validate the view pointer
306                let view = view.as_view();
307                let buf_index = view.buffer_index as usize;
308                let start_offset = view.offset as usize;
309                let end_offset = start_offset.saturating_add(view.size as usize);
310
311                let buf = buffers.get(buf_index).ok_or_else(||
312                    vortex_err!(InvalidArgument: "view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers",
313                        buffers.len()))?;
314
315                vortex_ensure!(
316                    start_offset < buf.len(),
317                    InvalidArgument: "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
318                    buf.len(),
319                );
320
321                vortex_ensure!(
322                    end_offset <= buf.len(),
323                    InvalidArgument: "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
324                    buf.len(),
325                );
326
327                // Make sure the prefix data matches the buffer data.
328                let bytes = &buf[start_offset..end_offset];
329                vortex_ensure!(
330                    view.prefix == bytes[..4],
331                    InvalidArgument: "VarBinView prefix does not match full string"
332                );
333
334                // Validate the full string
335                vortex_ensure!(
336                    validator(bytes),
337                    InvalidArgument: "view at index {idx}: outlined bytes fails utf-8 validation"
338                );
339            }
340        }
341
342        Ok(())
343    }
344
345    /// Splits the array into owned parts
346    pub fn into_parts(self) -> VarBinViewArrayParts {
347        VarBinViewArrayParts {
348            dtype: self.dtype,
349            buffers: self.buffers,
350            views: self.views,
351            validity: self.validity,
352        }
353    }
354
355    /// Number of raw string data buffers held by this array.
356    pub fn nbuffers(&self) -> usize {
357        self.buffers.len()
358    }
359
360    /// Access to the primitive views buffer.
361    ///
362    /// Variable-sized binary view buffer contain a "view" child array, with 16-byte entries that
363    /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of
364    /// the string (if the string has 12 bytes or fewer).
365    #[inline]
366    pub fn views(&self) -> &[BinaryView] {
367        let host_views = self.views.as_host();
368        let len = host_views.len() / size_of::<BinaryView>();
369
370        // SAFETY: data alignment is checked for host buffers on construction
371        unsafe { std::slice::from_raw_parts(host_views.as_ptr().cast(), len) }
372    }
373
374    /// Return the buffer handle backing the views.
375    pub fn views_handle(&self) -> &BufferHandle {
376        &self.views
377    }
378
379    /// Access value bytes at a given index
380    ///
381    /// Will return a `ByteBuffer` containing the data without performing a copy.
382    #[inline]
383    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
384        let views = self.views();
385        let view = &views[index];
386        // Expect this to be the common case: strings > 12 bytes.
387        if !view.is_inlined() {
388            let view_ref = view.as_view();
389            self.buffer(view_ref.buffer_index as usize)
390                .slice(view_ref.as_range())
391        } else {
392            // Return access to the range of bytes around it.
393            self.views_handle()
394                .as_host()
395                .clone()
396                .into_byte_buffer()
397                .slice_ref(view.as_inlined().value())
398        }
399    }
400
401    /// Access one of the backing data buffers.
402    ///
403    /// # Panics
404    ///
405    /// This method panics if the provided index is out of bounds for the set of buffers provided
406    /// at construction time.
407    #[inline]
408    pub fn buffer(&self, idx: usize) -> &ByteBuffer {
409        if idx >= self.nbuffers() {
410            vortex_panic!(
411                "{idx} buffer index out of bounds, there are {} buffers",
412                self.nbuffers()
413            );
414        }
415        self.buffers[idx].as_host()
416    }
417
418    /// Iterate over the underlying raw data buffers, not including the views buffer.
419    #[inline]
420    pub fn buffers(&self) -> &Arc<[BufferHandle]> {
421        &self.buffers
422    }
423
424    /// Accumulate an iterable set of values into our type here.
425    #[expect(
426        clippy::same_name_method,
427        reason = "intentionally named from_iter like Iterator::from_iter"
428    )]
429    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
430        iter: I,
431        dtype: DType,
432    ) -> Self {
433        let iter = iter.into_iter();
434        let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
435
436        for item in iter {
437            match item {
438                None => builder.append_null(),
439                Some(v) => builder.append_value(v),
440            }
441        }
442
443        builder.finish_into_varbinview()
444    }
445
446    pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
447        let iter = iter.into_iter();
448        let mut builder = VarBinViewBuilder::with_capacity(
449            DType::Utf8(Nullability::NonNullable),
450            iter.size_hint().0,
451        );
452
453        for item in iter {
454            builder.append_value(item.as_ref());
455        }
456
457        builder.finish_into_varbinview()
458    }
459
460    pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
461        iter: I,
462    ) -> Self {
463        let iter = iter.into_iter();
464        let mut builder = VarBinViewBuilder::with_capacity(
465            DType::Utf8(Nullability::Nullable),
466            iter.size_hint().0,
467        );
468
469        for item in iter {
470            match item {
471                None => builder.append_null(),
472                Some(v) => builder.append_value(v.as_ref()),
473            }
474        }
475
476        builder.finish_into_varbinview()
477    }
478
479    pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
480        let iter = iter.into_iter();
481        let mut builder = VarBinViewBuilder::with_capacity(
482            DType::Binary(Nullability::NonNullable),
483            iter.size_hint().0,
484        );
485
486        for item in iter {
487            builder.append_value(item.as_ref());
488        }
489
490        builder.finish_into_varbinview()
491    }
492
493    pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
494        iter: I,
495    ) -> Self {
496        let iter = iter.into_iter();
497        let mut builder = VarBinViewBuilder::with_capacity(
498            DType::Binary(Nullability::Nullable),
499            iter.size_hint().0,
500        );
501
502        for item in iter {
503            match item {
504                None => builder.append_null(),
505                Some(v) => builder.append_value(v.as_ref()),
506            }
507        }
508
509        builder.finish_into_varbinview()
510    }
511}
512
513impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
514    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
515        Self::from_iter_nullable_bin(iter)
516    }
517}
518
519impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
520    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
521        Self::from_iter_nullable_bin(iter)
522    }
523}
524
525impl FromIterator<Option<String>> for VarBinViewArray {
526    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
527        Self::from_iter_nullable_str(iter)
528    }
529}
530
531impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
532    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
533        Self::from_iter_nullable_str(iter)
534    }
535}