vortex_array/arrays/varbinview/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use vortex_buffer::Buffer;
7use vortex_buffer::ByteBuffer;
8use vortex_dtype::DType;
9use vortex_dtype::Nullability;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_bail;
13use vortex_error::vortex_ensure;
14use vortex_error::vortex_err;
15use vortex_error::vortex_panic;
16use vortex_vector::binaryview::BinaryView;
17
18use crate::builders::ArrayBuilder;
19use crate::builders::VarBinViewBuilder;
20use crate::stats::ArrayStats;
21use crate::validity::Validity;
22
/// A variable-length binary view array that stores strings and binary data efficiently.
///
/// This mirrors the Apache Arrow StringView/BinaryView array encoding and provides
/// an optimized representation for variable-length data with excellent performance
/// characteristics for both short and long strings.
///
/// ## Data Layout
///
/// The array uses a hybrid storage approach with two main components:
/// - **Views buffer**: Array of 16-byte `BinaryView` entries (one per logical element)
/// - **Data buffers**: Shared backing storage for strings longer than 12 bytes
///
/// ## View Structure
///
/// Commonly referred to as "German Strings", each 16-byte view entry contains either:
/// - **Inlined data**: For strings ≤ 12 bytes, the entire string is stored directly in the view
/// - **Reference data**: For strings > 12 bytes, contains:
///   - String length (4 bytes)
///   - First 4 bytes of string as prefix (4 bytes)
///   - Buffer index and offset (8 bytes total)
///
/// The following ASCII graphic is reproduced verbatim from the Arrow documentation:
///
/// ```text
///                         ┌──────┬────────────────────────┐
///                         │length│      string value      │
///    Strings (len <= 12)  │      │    (padded with 0)     │
///                         └──────┴────────────────────────┘
///                          0    31                      127
///
///                         ┌───────┬───────┬───────┬───────┐
///                         │length │prefix │  buf  │offset │
///    Strings (len > 12)   │       │       │ index │       │
///                         └───────┴───────┴───────┴───────┘
///                          0    31       63      95    127
/// ```
///
/// # Examples
///
/// ```
/// use vortex_array::arrays::VarBinViewArray;
/// use vortex_dtype::{DType, Nullability};
/// use vortex_array::IntoArray;
///
/// // Create from an Iterator<Item = &str>
/// let array = VarBinViewArray::from_iter_str([
///         "inlined",
///         "this string is outlined"
/// ]);
///
/// assert_eq!(array.len(), 2);
///
/// // Access individual strings
/// let first = array.bytes_at(0);
/// assert_eq!(first.as_slice(), b"inlined"); // Short string (≤ 12 bytes), stored in the view
///
/// let second = array.bytes_at(1);
/// assert_eq!(second.as_slice(), b"this string is outlined"); // Long string
/// ```
#[derive(Clone, Debug)]
pub struct VarBinViewArray {
    /// Logical type of the array; validation accepts only `DType::Utf8` and `DType::Binary`.
    pub(super) dtype: DType,
    /// Backing data buffers that non-inlined views point into (by buffer index and offset).
    pub(super) buffers: Arc<[ByteBuffer]>,
    /// One 16-byte [`BinaryView`] per logical element.
    pub(super) views: Buffer<BinaryView>,
    /// Null information for the elements; its nullability must agree with `dtype`.
    pub(super) validity: Validity,
    /// Statistics holder for this array; constructors initialize it to its default value.
    pub(super) stats_set: ArrayStats,
}
90
91impl VarBinViewArray {
92    /// Creates a new [`VarBinViewArray`].
93    ///
94    /// # Panics
95    ///
96    /// Panics if the provided components do not satisfy the invariants documented
97    /// in [`VarBinViewArray::new_unchecked`].
98    pub fn new(
99        views: Buffer<BinaryView>,
100        buffers: Arc<[ByteBuffer]>,
101        dtype: DType,
102        validity: Validity,
103    ) -> Self {
104        Self::try_new(views, buffers, dtype, validity)
105            .vortex_expect("VarBinViewArray construction failed")
106    }
107
108    /// Constructs a new `VarBinViewArray`.
109    ///
110    /// See [`VarBinViewArray::new_unchecked`] for more information.
111    ///
112    /// # Errors
113    ///
114    /// Returns an error if the provided components do not satisfy the invariants documented in
115    /// [`VarBinViewArray::new_unchecked`].
116    pub fn try_new(
117        views: Buffer<BinaryView>,
118        buffers: Arc<[ByteBuffer]>,
119        dtype: DType,
120        validity: Validity,
121    ) -> VortexResult<Self> {
122        Self::validate(&views, &buffers, &dtype, &validity)?;
123
124        // SAFETY: validate ensures all invariants are met.
125        Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) })
126    }
127
128    /// Creates a new [`VarBinViewArray`] without validation from these components:
129    ///
130    /// * `views` is a buffer of 16-byte view entries (one per logical element).
131    /// * `buffers` contains the backing storage for strings longer than 12 bytes.
132    /// * `dtype` specifies whether this contains UTF-8 strings or binary data.
133    /// * `validity` holds the null values.
134    ///
135    /// # Safety
136    ///
137    /// The caller must ensure all of the following invariants are satisfied:
138    ///
139    /// ## View Requirements
140    ///
141    /// - Views must be properly formatted 16-byte [`BinaryView`] entries.
142    /// - Inlined views (length ≤ 12) must have valid data in the first `length` bytes.
143    /// - Reference views (length > 12) must:
144    ///   - Have a valid buffer index < `buffers.len()`.
145    ///   - Have valid offsets that don't exceed the referenced buffer's bounds.
146    ///   - Have a 4-byte prefix that matches the actual data at the referenced location.
147    ///
148    /// ## Type Requirements
149    ///
150    /// - `dtype` must be either [`DType::Utf8`] or [`DType::Binary`].
151    /// - For [`DType::Utf8`], all string data (both inlined and referenced) must be valid UTF-8.
152    ///
153    /// ## Validity Requirements
154    ///
155    /// - The validity must have the same nullability as the dtype.
156    /// - If validity is an array, its length must match `views.len()`.
157    pub unsafe fn new_unchecked(
158        views: Buffer<BinaryView>,
159        buffers: Arc<[ByteBuffer]>,
160        dtype: DType,
161        validity: Validity,
162    ) -> Self {
163        #[cfg(debug_assertions)]
164        Self::validate(&views, &buffers, &dtype, &validity)
165            .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters");
166
167        Self {
168            dtype,
169            buffers,
170            views,
171            validity,
172            stats_set: Default::default(),
173        }
174    }
175
176    /// Validates the components that would be used to create a [`VarBinViewArray`].
177    ///
178    /// This function checks all the invariants required by [`VarBinViewArray::new_unchecked`].
179    pub fn validate(
180        views: &Buffer<BinaryView>,
181        buffers: &Arc<[ByteBuffer]>,
182        dtype: &DType,
183        validity: &Validity,
184    ) -> VortexResult<()> {
185        vortex_ensure!(
186            validity.nullability() == dtype.nullability(),
187            "validity {:?} incompatible with nullability {:?}",
188            validity,
189            dtype.nullability()
190        );
191
192        match dtype {
193            DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
194                simdutf8::basic::from_utf8(string).is_ok()
195            })?,
196            DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
197            _ => vortex_bail!("invalid DType {dtype} for `VarBinViewArray`"),
198        }
199
200        Ok(())
201    }
202
203    fn validate_views<F>(
204        views: &Buffer<BinaryView>,
205        buffers: &Arc<[ByteBuffer]>,
206        validity: &Validity,
207        validator: F,
208    ) -> VortexResult<()>
209    where
210        F: Fn(&[u8]) -> bool,
211    {
212        for (idx, &view) in views.iter().enumerate() {
213            if validity.is_null(idx) {
214                continue;
215            }
216
217            if view.is_inlined() {
218                // Validate the inline bytestring
219                let bytes = &view.as_inlined().data[..view.len() as usize];
220                vortex_ensure!(
221                    validator(bytes),
222                    "view at index {idx}: inlined bytes failed utf-8 validation"
223                );
224            } else {
225                // Validate the view pointer
226                let view = view.as_view();
227                let buf_index = view.buffer_index as usize;
228                let start_offset = view.offset as usize;
229                let end_offset = start_offset.saturating_add(view.size as usize);
230
231                let buf = buffers.get(buf_index).ok_or_else(||
232                    vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers",
233                        buffers.len()))?;
234
235                vortex_ensure!(
236                    start_offset < buf.len(),
237                    "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
238                    buf.len(),
239                );
240
241                vortex_ensure!(
242                    end_offset <= buf.len(),
243                    "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
244                    buf.len(),
245                );
246
247                // Make sure the prefix data matches the buffer data.
248                let bytes = &buf[start_offset..end_offset];
249                vortex_ensure!(
250                    view.prefix == bytes[..4],
251                    "VarBinView prefix does not match full string"
252                );
253
254                // Validate the full string
255                vortex_ensure!(
256                    validator(bytes),
257                    "view at index {idx}: outlined bytes fails utf-8 validation"
258                );
259            }
260        }
261
262        Ok(())
263    }
264
265    /// Number of raw string data buffers held by this array.
266    pub fn nbuffers(&self) -> usize {
267        self.buffers.len()
268    }
269
270    /// Access to the primitive views buffer.
271    ///
272    /// Variable-sized binary view buffer contain a "view" child array, with 16-byte entries that
273    /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of
274    /// the string (if the string has 12 bytes or fewer).
275    #[inline]
276    pub fn views(&self) -> &Buffer<BinaryView> {
277        &self.views
278    }
279
280    /// Access value bytes at a given index
281    ///
282    /// Will return a `ByteBuffer` containing the data without performing a copy.
283    #[inline]
284    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
285        let views = self.views();
286        let view = &views[index];
287        // Expect this to be the common case: strings > 12 bytes.
288        if !view.is_inlined() {
289            let view_ref = view.as_view();
290            self.buffer(view_ref.buffer_index as usize)
291                .slice(view_ref.as_range())
292        } else {
293            // Return access to the range of bytes around it.
294            views
295                .clone()
296                .into_byte_buffer()
297                .slice_ref(view.as_inlined().value())
298        }
299    }
300
301    /// Access one of the backing data buffers.
302    ///
303    /// # Panics
304    ///
305    /// This method panics if the provided index is out of bounds for the set of buffers provided
306    /// at construction time.
307    #[inline]
308    pub fn buffer(&self, idx: usize) -> &ByteBuffer {
309        if idx >= self.nbuffers() {
310            vortex_panic!(
311                "{idx} buffer index out of bounds, there are {} buffers",
312                self.nbuffers()
313            );
314        }
315        &self.buffers[idx]
316    }
317
318    /// Iterate over the underlying raw data buffers, not including the views buffer.
319    #[inline]
320    pub fn buffers(&self) -> &Arc<[ByteBuffer]> {
321        &self.buffers
322    }
323
324    /// Accumulate an iterable set of values into our type here.
325    #[expect(
326        clippy::same_name_method,
327        reason = "intentionally named from_iter like Iterator::from_iter"
328    )]
329    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
330        iter: I,
331        dtype: DType,
332    ) -> Self {
333        let iter = iter.into_iter();
334        let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
335
336        for item in iter {
337            match item {
338                None => builder.append_null(),
339                Some(v) => builder.append_value(v),
340            }
341        }
342
343        builder.finish_into_varbinview()
344    }
345
346    pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
347        let iter = iter.into_iter();
348        let mut builder = VarBinViewBuilder::with_capacity(
349            DType::Utf8(Nullability::NonNullable),
350            iter.size_hint().0,
351        );
352
353        for item in iter {
354            builder.append_value(item.as_ref());
355        }
356
357        builder.finish_into_varbinview()
358    }
359
360    pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
361        iter: I,
362    ) -> Self {
363        let iter = iter.into_iter();
364        let mut builder = VarBinViewBuilder::with_capacity(
365            DType::Utf8(Nullability::Nullable),
366            iter.size_hint().0,
367        );
368
369        for item in iter {
370            match item {
371                None => builder.append_null(),
372                Some(v) => builder.append_value(v.as_ref()),
373            }
374        }
375
376        builder.finish_into_varbinview()
377    }
378
379    pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
380        let iter = iter.into_iter();
381        let mut builder = VarBinViewBuilder::with_capacity(
382            DType::Binary(Nullability::NonNullable),
383            iter.size_hint().0,
384        );
385
386        for item in iter {
387            builder.append_value(item.as_ref());
388        }
389
390        builder.finish_into_varbinview()
391    }
392
393    pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
394        iter: I,
395    ) -> Self {
396        let iter = iter.into_iter();
397        let mut builder = VarBinViewBuilder::with_capacity(
398            DType::Binary(Nullability::Nullable),
399            iter.size_hint().0,
400        );
401
402        for item in iter {
403            match item {
404                None => builder.append_null(),
405                Some(v) => builder.append_value(v.as_ref()),
406            }
407        }
408
409        builder.finish_into_varbinview()
410    }
411}
412
413impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
414    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
415        Self::from_iter_nullable_bin(iter)
416    }
417}
418
419impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
420    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
421        Self::from_iter_nullable_bin(iter)
422    }
423}
424
425impl FromIterator<Option<String>> for VarBinViewArray {
426    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
427        Self::from_iter_nullable_str(iter)
428    }
429}
430
431impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
432    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
433        Self::from_iter_nullable_str(iter)
434    }
435}