vortex_array/arrays/varbinview/array.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use vortex_buffer::{Buffer, ByteBuffer};
7use vortex_dtype::{DType, Nullability};
8use vortex_error::{
9    VortexExpect, VortexResult, vortex_bail, vortex_ensure, vortex_err, vortex_panic,
10};
11
12use crate::arrays::binary_view::BinaryView;
13use crate::builders::{ArrayBuilder, VarBinViewBuilder};
14use crate::stats::ArrayStats;
15use crate::validity::Validity;
16
/// A variable-length binary view array that stores strings and binary data efficiently.
///
/// This mirrors the Apache Arrow StringView/BinaryView array encoding and provides
/// an optimized representation for variable-length data with excellent performance
/// characteristics for both short and long strings.
///
/// ## Data Layout
///
/// The array uses a hybrid storage approach with two main components:
/// - **Views buffer**: Array of 16-byte `BinaryView` entries (one per logical element)
/// - **Data buffers**: Shared backing storage for strings longer than 12 bytes
///
/// ## View Structure
///
/// This layout is commonly referred to as "German Strings". Each 16-byte view entry
/// contains either:
/// - **Inlined data**: For strings ≤ 12 bytes, the entire string is stored directly in the view
/// - **Reference data**: For strings > 12 bytes, the view contains:
///   - String length (4 bytes)
///   - First 4 bytes of string as prefix (4 bytes)
///   - Buffer index and offset (8 bytes total)
///
/// The following ASCII graphic is reproduced verbatim from the Arrow documentation:
///
/// ```text
///                         ┌──────┬────────────────────────┐
///                         │length│      string value      │
///    Strings (len <= 12)  │      │    (padded with 0)     │
///                         └──────┴────────────────────────┘
///                          0    31                      127
///
///                         ┌───────┬───────┬───────┬───────┐
///                         │length │prefix │  buf  │offset │
///    Strings (len > 12)   │       │       │ index │       │
///                         └───────┴───────┴───────┴───────┘
///                          0    31       63      95    127
/// ```
///
/// # Examples
///
/// ```
/// use vortex_array::arrays::VarBinViewArray;
/// use vortex_dtype::{DType, Nullability};
/// use vortex_array::IntoArray;
///
/// // Create from an Iterator<Item = &str>
/// let array = VarBinViewArray::from_iter_str([
///         "inlined",
///         "this string is outlined"
/// ]);
///
/// assert_eq!(array.len(), 2);
///
/// // Access individual strings
/// let first = array.bytes_at(0);
/// assert_eq!(first.as_slice(), b"inlined"); // Short string (stored inline)
///
/// let second = array.bytes_at(1);
/// assert_eq!(second.as_slice(), b"this string is outlined"); // Long string
/// ```
#[derive(Clone, Debug)]
pub struct VarBinViewArray {
    // Logical type of the array; construction only accepts `DType::Utf8` or `DType::Binary`.
    pub(super) dtype: DType,
    // Backing data buffers that reference views (length > 12) point into.
    pub(super) buffers: Arc<[ByteBuffer]>,
    // One 16-byte view entry per logical element.
    pub(super) views: Buffer<BinaryView>,
    // Null information for the elements; its nullability must agree with `dtype`.
    pub(super) validity: Validity,
    // Statistics attached to this array; initialised to the default value on construction.
    pub(super) stats_set: ArrayStats,
}
84
85impl VarBinViewArray {
86    /// Creates a new [`VarBinViewArray`].
87    ///
88    /// # Panics
89    ///
90    /// Panics if the provided components do not satisfy the invariants documented
91    /// in [`VarBinViewArray::new_unchecked`].
92    pub fn new(
93        views: Buffer<BinaryView>,
94        buffers: Arc<[ByteBuffer]>,
95        dtype: DType,
96        validity: Validity,
97    ) -> Self {
98        Self::try_new(views, buffers, dtype, validity)
99            .vortex_expect("VarBinViewArray construction failed")
100    }
101
102    /// Constructs a new `VarBinViewArray`.
103    ///
104    /// See [`VarBinViewArray::new_unchecked`] for more information.
105    ///
106    /// # Errors
107    ///
108    /// Returns an error if the provided components do not satisfy the invariants documented in
109    /// [`VarBinViewArray::new_unchecked`].
110    pub fn try_new(
111        views: Buffer<BinaryView>,
112        buffers: Arc<[ByteBuffer]>,
113        dtype: DType,
114        validity: Validity,
115    ) -> VortexResult<Self> {
116        Self::validate(&views, &buffers, &dtype, &validity)?;
117
118        // SAFETY: validate ensures all invariants are met.
119        Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) })
120    }
121
122    /// Creates a new [`VarBinViewArray`] without validation from these components:
123    ///
124    /// * `views` is a buffer of 16-byte view entries (one per logical element).
125    /// * `buffers` contains the backing storage for strings longer than 12 bytes.
126    /// * `dtype` specifies whether this contains UTF-8 strings or binary data.
127    /// * `validity` holds the null values.
128    ///
129    /// # Safety
130    ///
131    /// The caller must ensure all of the following invariants are satisfied:
132    ///
133    /// ## View Requirements
134    ///
135    /// - Views must be properly formatted 16-byte [`BinaryView`] entries.
136    /// - Inlined views (length ≤ 12) must have valid data in the first `length` bytes.
137    /// - Reference views (length > 12) must:
138    ///   - Have a valid buffer index < `buffers.len()`.
139    ///   - Have valid offsets that don't exceed the referenced buffer's bounds.
140    ///   - Have a 4-byte prefix that matches the actual data at the referenced location.
141    ///
142    /// ## Type Requirements
143    ///
144    /// - `dtype` must be either [`DType::Utf8`] or [`DType::Binary`].
145    /// - For [`DType::Utf8`], all string data (both inlined and referenced) must be valid UTF-8.
146    ///
147    /// ## Validity Requirements
148    ///
149    /// - The validity must have the same nullability as the dtype.
150    /// - If validity is an array, its length must match `views.len()`.
151    pub unsafe fn new_unchecked(
152        views: Buffer<BinaryView>,
153        buffers: Arc<[ByteBuffer]>,
154        dtype: DType,
155        validity: Validity,
156    ) -> Self {
157        #[cfg(debug_assertions)]
158        Self::validate(&views, &buffers, &dtype, &validity)
159            .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters");
160
161        Self {
162            dtype,
163            buffers,
164            views,
165            validity,
166            stats_set: Default::default(),
167        }
168    }
169
170    /// Validates the components that would be used to create a [`VarBinViewArray`].
171    ///
172    /// This function checks all the invariants required by [`VarBinViewArray::new_unchecked`].
173    pub fn validate(
174        views: &Buffer<BinaryView>,
175        buffers: &Arc<[ByteBuffer]>,
176        dtype: &DType,
177        validity: &Validity,
178    ) -> VortexResult<()> {
179        vortex_ensure!(
180            validity.nullability() == dtype.nullability(),
181            "validity {:?} incompatible with nullability {:?}",
182            validity,
183            dtype.nullability()
184        );
185
186        match dtype {
187            DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
188                simdutf8::basic::from_utf8(string).is_ok()
189            })?,
190            DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
191            _ => vortex_bail!("invalid DType {dtype} for `VarBinViewArray`"),
192        }
193
194        Ok(())
195    }
196
197    fn validate_views<F>(
198        views: &Buffer<BinaryView>,
199        buffers: &Arc<[ByteBuffer]>,
200        validity: &Validity,
201        validator: F,
202    ) -> VortexResult<()>
203    where
204        F: Fn(&[u8]) -> bool,
205    {
206        for (idx, &view) in views.iter().enumerate() {
207            if validity.is_null(idx) {
208                continue;
209            }
210
211            if view.is_inlined() {
212                // Validate the inline bytestring
213                let bytes = &unsafe { view.inlined }.data[..view.len() as usize];
214                vortex_ensure!(
215                    validator(bytes),
216                    "view at index {idx}: inlined bytes failed utf-8 validation"
217                );
218            } else {
219                // Validate the view pointer
220                let view = view.as_view();
221                let buf_index = view.buffer_index as usize;
222                let start_offset = view.offset as usize;
223                let end_offset = start_offset.saturating_add(view.size as usize);
224
225                let buf = buffers.get(buf_index).ok_or_else(||
226                    vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers",
227                        buffers.len()))?;
228
229                vortex_ensure!(
230                    start_offset < buf.len(),
231                    "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
232                    buf.len(),
233                );
234
235                vortex_ensure!(
236                    end_offset <= buf.len(),
237                    "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
238                    buf.len(),
239                );
240
241                // Make sure the prefix data matches the buffer data.
242                let bytes = &buf[start_offset..end_offset];
243                vortex_ensure!(
244                    view.prefix == bytes[..4],
245                    "VarBinView prefix does not match full string"
246                );
247
248                // Validate the full string
249                vortex_ensure!(
250                    validator(bytes),
251                    "view at index {idx}: outlined bytes fails utf-8 validation"
252                );
253            }
254        }
255
256        Ok(())
257    }
258
    /// Returns the number of raw data buffers held by this array.
    ///
    /// The views buffer is stored separately and is not included in this count.
    pub fn nbuffers(&self) -> usize {
        self.buffers.len()
    }
263
    /// Returns a reference to the views buffer.
    ///
    /// A variable-sized binary view array keeps one 16-byte view per element. Each view
    /// holds either a pointer into one of the array's data buffers OR an inlined copy of
    /// the string (if the string has 12 bytes or fewer).
    #[inline]
    pub fn views(&self) -> &Buffer<BinaryView> {
        &self.views
    }
273
274    /// Access value bytes at a given index
275    ///
276    /// Will return a `ByteBuffer` containing the data without performing a copy.
277    #[inline]
278    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
279        let views = self.views();
280        let view = &views[index];
281        // Expect this to be the common case: strings > 12 bytes.
282        if !view.is_inlined() {
283            let view_ref = view.as_view();
284            self.buffer(view_ref.buffer_index() as usize)
285                .slice(view_ref.as_range())
286        } else {
287            // Return access to the range of bytes around it.
288            views
289                .clone()
290                .into_byte_buffer()
291                .slice_ref(view.as_inlined().value())
292        }
293    }
294
295    /// Access one of the backing data buffers.
296    ///
297    /// # Panics
298    ///
299    /// This method panics if the provided index is out of bounds for the set of buffers provided
300    /// at construction time.
301    #[inline]
302    pub fn buffer(&self, idx: usize) -> &ByteBuffer {
303        if idx >= self.nbuffers() {
304            vortex_panic!(
305                "{idx} buffer index out of bounds, there are {} buffers",
306                self.nbuffers()
307            );
308        }
309        &self.buffers[idx]
310    }
311
    /// Returns a reference to the shared list of raw data buffers, not including the
    /// views buffer.
    #[inline]
    pub fn buffers(&self) -> &Arc<[ByteBuffer]> {
        &self.buffers
    }
317
318    /// Accumulate an iterable set of values into our type here.
319    #[allow(clippy::same_name_method)]
320    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
321        iter: I,
322        dtype: DType,
323    ) -> Self {
324        let iter = iter.into_iter();
325        let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
326
327        for item in iter {
328            match item {
329                None => builder.append_null(),
330                Some(v) => builder.append_value(v),
331            }
332        }
333
334        builder.finish_into_varbinview()
335    }
336
337    pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
338        let iter = iter.into_iter();
339        let mut builder = VarBinViewBuilder::with_capacity(
340            DType::Utf8(Nullability::NonNullable),
341            iter.size_hint().0,
342        );
343
344        for item in iter {
345            builder.append_value(item.as_ref());
346        }
347
348        builder.finish_into_varbinview()
349    }
350
351    pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
352        iter: I,
353    ) -> Self {
354        let iter = iter.into_iter();
355        let mut builder = VarBinViewBuilder::with_capacity(
356            DType::Utf8(Nullability::Nullable),
357            iter.size_hint().0,
358        );
359
360        for item in iter {
361            match item {
362                None => builder.append_null(),
363                Some(v) => builder.append_value(v.as_ref()),
364            }
365        }
366
367        builder.finish_into_varbinview()
368    }
369
370    pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
371        let iter = iter.into_iter();
372        let mut builder = VarBinViewBuilder::with_capacity(
373            DType::Binary(Nullability::NonNullable),
374            iter.size_hint().0,
375        );
376
377        for item in iter {
378            builder.append_value(item.as_ref());
379        }
380
381        builder.finish_into_varbinview()
382    }
383
384    pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
385        iter: I,
386    ) -> Self {
387        let iter = iter.into_iter();
388        let mut builder = VarBinViewBuilder::with_capacity(
389            DType::Binary(Nullability::Nullable),
390            iter.size_hint().0,
391        );
392
393        for item in iter {
394            match item {
395                None => builder.append_null(),
396                Some(v) => builder.append_value(v.as_ref()),
397            }
398        }
399
400        builder.finish_into_varbinview()
401    }
402}
403
404impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
405    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
406        Self::from_iter_nullable_bin(iter)
407    }
408}
409
410impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
411    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
412        Self::from_iter_nullable_bin(iter)
413    }
414}
415
416impl FromIterator<Option<String>> for VarBinViewArray {
417    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
418        Self::from_iter_nullable_str(iter)
419    }
420}
421
422impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
423    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
424        Self::from_iter_nullable_str(iter)
425    }
426}