vortex_array/arrays/varbin/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use num_traits::AsPrimitive;
5use vortex_buffer::ByteBuffer;
6use vortex_dtype::DType;
7use vortex_dtype::IntegerPType;
8use vortex_dtype::Nullability;
9use vortex_dtype::match_each_integer_ptype;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_ensure;
13use vortex_error::vortex_err;
14
15use crate::Array;
16use crate::ArrayRef;
17use crate::ToCanonical;
18use crate::arrays::varbin::builder::VarBinBuilder;
19use crate::stats::ArrayStats;
20use crate::validity::Validity;
21
/// Variable-length binary / UTF-8 array made of a single contiguous byte buffer plus an
/// offsets array delimiting each value inside that buffer.
#[derive(Clone, Debug)]
pub struct VarBinArray {
    /// Logical type of the array; `validate` only accepts `DType::Binary` or `DType::Utf8`.
    pub(super) dtype: DType,
    /// All values concatenated back to back; individual values are located via `offsets`.
    bytes: ByteBuffer,
    /// Integer offsets into `bytes`. Holds one more entry than the array has elements:
    /// value `i` spans `bytes[offsets[i]..offsets[i + 1]]`.
    offsets: ArrayRef,
    /// Validity (null-ness) of the values; its nullability must agree with `dtype`.
    pub(super) validity: Validity,
    /// Statistics attached to this array; starts out as `Default::default()` on construction.
    pub(super) stats_set: ArrayStats,
}
30
31impl VarBinArray {
32    /// Creates a new [`VarBinArray`].
33    ///
34    /// # Panics
35    ///
36    /// Panics if the provided components do not satisfy the invariants documented
37    /// in [`VarBinArray::new_unchecked`].
38    pub fn new(offsets: ArrayRef, bytes: ByteBuffer, dtype: DType, validity: Validity) -> Self {
39        Self::try_new(offsets, bytes, dtype, validity).vortex_expect("VarBinArray new")
40    }
41
42    /// Constructs a new `VarBinArray`.
43    ///
44    /// See [`VarBinArray::new_unchecked`] for more information.
45    ///
46    /// # Errors
47    ///
48    /// Returns an error if the provided components do not satisfy the invariants documented in
49    /// [`VarBinArray::new_unchecked`].
50    pub fn try_new(
51        offsets: ArrayRef,
52        bytes: ByteBuffer,
53        dtype: DType,
54        validity: Validity,
55    ) -> VortexResult<Self> {
56        Self::validate(&offsets, &bytes, &dtype, &validity)?;
57
58        // SAFETY: validate ensures all invariants are met.
59        Ok(unsafe { Self::new_unchecked(offsets, bytes, dtype, validity) })
60    }
61
62    /// Creates a new [`VarBinArray`] without validation from these components:
63    ///
64    /// * `offsets` is an array of byte offsets into the `bytes` buffer.
65    /// * `bytes` is a buffer containing all the variable-length data concatenated.
66    /// * `dtype` specifies whether this contains UTF-8 strings or binary data.
67    /// * `validity` holds the null values.
68    ///
69    /// # Safety
70    ///
71    /// The caller must ensure all of the following invariants are satisfied:
72    ///
73    /// ## Offsets Requirements
74    ///
75    /// - `offsets` must be a non-nullable integer array.
76    /// - `offsets` must contain at least 1 element (for empty array, it contains \[0\]).
77    /// - All values in `offsets` must be monotonically non-decreasing.
78    /// - The first value in `offsets` must be 0.
79    /// - No offset value may exceed `bytes.len()`.
80    ///
81    /// ## Type Requirements
82    ///
83    /// - `dtype` must be exactly [`DType::Binary`] or [`DType::Utf8`].
84    /// - If `dtype` is [`DType::Utf8`], every byte slice `bytes[offsets[i]..offsets[i+1]]` must be valid UTF-8.
85    /// - `dtype.is_nullable()` must match the nullability of `validity`.
86    ///
87    /// ## Validity Requirements
88    ///
89    /// - If `validity` is [`Validity::Array`], its length must exactly equal `offsets.len() - 1`.
90    pub unsafe fn new_unchecked(
91        offsets: ArrayRef,
92        bytes: ByteBuffer,
93        dtype: DType,
94        validity: Validity,
95    ) -> Self {
96        #[cfg(debug_assertions)]
97        Self::validate(&offsets, &bytes, &dtype, &validity)
98            .vortex_expect("[Debug Assertion]: Invalid `VarBinArray` parameters");
99
100        Self {
101            dtype,
102            bytes,
103            offsets,
104            validity,
105            stats_set: Default::default(),
106        }
107    }
108
109    /// Validates the components that would be used to create a [`VarBinArray`].
110    ///
111    /// This function checks all the invariants required by [`VarBinArray::new_unchecked`].
112    pub fn validate(
113        offsets: &dyn Array,
114        bytes: &ByteBuffer,
115        dtype: &DType,
116        validity: &Validity,
117    ) -> VortexResult<()> {
118        // Check offsets are non-nullable integer
119        vortex_ensure!(
120            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
121            MismatchedTypes: "non nullable int", offsets.dtype()
122        );
123
124        // Check dtype is Binary or Utf8
125        vortex_ensure!(
126            matches!(dtype, DType::Binary(_) | DType::Utf8(_)),
127            MismatchedTypes: "utf8 or binary", dtype
128        );
129
130        // Check nullability matches
131        vortex_ensure!(
132            dtype.is_nullable() != (validity == &Validity::NonNullable),
133            "incorrect validity {:?} for dtype {}",
134            validity,
135            dtype
136        );
137
138        // Check offsets has at least one element
139        vortex_ensure!(
140            !offsets.is_empty(),
141            "Offsets must have at least one element"
142        );
143
144        // Check offsets are sorted
145        if let Some(is_sorted) = offsets.statistics().compute_is_sorted() {
146            vortex_ensure!(is_sorted, "offsets must be sorted");
147        }
148
149        let last_offset = offsets
150            .scalar_at(offsets.len() - 1)
151            .as_primitive()
152            .as_::<usize>()
153            .ok_or_else(|| vortex_err!("Last offset must be convertible to usize"))?;
154        vortex_ensure!(
155            last_offset <= bytes.len(),
156            "Last offset {} exceeds bytes length {}",
157            last_offset,
158            bytes.len()
159        );
160
161        // Check validity length
162        if let Some(validity_len) = validity.maybe_len() {
163            vortex_ensure!(
164                validity_len == offsets.len() - 1,
165                "Validity length {} doesn't match array length {}",
166                validity_len,
167                offsets.len() - 1
168            );
169        }
170
171        // Validate UTF-8 for Utf8 dtype
172        if matches!(dtype, DType::Utf8(_)) {
173            let primitive_offsets = offsets.to_primitive();
174            match_each_integer_ptype!(primitive_offsets.dtype().as_ptype(), |O| {
175                let offsets_slice = primitive_offsets.as_slice::<O>();
176                for (i, (start, end)) in offsets_slice
177                    .windows(2)
178                    .map(|o| (o[0].as_(), o[1].as_()))
179                    .enumerate()
180                {
181                    if validity.is_null(i) {
182                        continue;
183                    }
184
185                    let string_bytes = &bytes.as_ref()[start..end];
186                    simdutf8::basic::from_utf8(string_bytes).map_err(|_| {
187                        #[allow(clippy::unwrap_used)]
188                        // run validation using `compat` package to get more detailed error message
189                        let err = simdutf8::compat::from_utf8(string_bytes).unwrap_err();
190                        vortex_err!("invalid utf-8: {err} at index {i}")
191                    })?;
192                }
193            });
194        }
195
196        Ok(())
197    }
198
199    #[inline]
200    pub fn offsets(&self) -> &ArrayRef {
201        &self.offsets
202    }
203
204    /// Access the value bytes child buffer
205    ///
206    /// # Note
207    ///
208    /// Bytes child buffer is never sliced when the array is sliced so this can include values
209    /// that are not logically present in the array. Users should prefer [sliced_bytes][Self::sliced_bytes]
210    /// unless they're resolving values via the offset child array.
211    #[inline]
212    pub fn bytes(&self) -> &ByteBuffer {
213        &self.bytes
214    }
215
216    /// Access value bytes child array limited to values that are logically present in
217    /// the array unlike [bytes][Self::bytes].
218    pub fn sliced_bytes(&self) -> ByteBuffer {
219        let first_offset: usize = self.offset_at(0);
220        let last_offset = self.offset_at(self.len());
221
222        self.bytes().slice(first_offset..last_offset)
223    }
224
225    pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
226        let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
227        if size < u32::MAX as usize {
228            Self::from_vec_sized::<u32, T>(vec, dtype)
229        } else {
230            Self::from_vec_sized::<u64, T>(vec, dtype)
231        }
232    }
233
234    fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
235    where
236        O: IntegerPType,
237        T: AsRef<[u8]>,
238    {
239        let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
240        for v in vec {
241            builder.append_value(v.as_ref());
242        }
243        builder.finish(dtype)
244    }
245
246    #[expect(
247        clippy::same_name_method,
248        reason = "intentionally named from_iter like Iterator::from_iter"
249    )]
250    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
251        iter: I,
252        dtype: DType,
253    ) -> Self {
254        let iter = iter.into_iter();
255        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
256        for v in iter {
257            builder.append(v.as_ref().map(|o| o.as_ref()));
258        }
259        builder.finish(dtype)
260    }
261
262    pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
263        iter: I,
264        dtype: DType,
265    ) -> Self {
266        let iter = iter.into_iter();
267        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
268        for v in iter {
269            builder.append_value(v);
270        }
271        builder.finish(dtype)
272    }
273
274    /// Get value offset at a given index
275    ///
276    /// Note: There's 1 more offsets than the elements in the array, thus last offset is at array length index
277    ///
278    /// Panics if index is out of bounds
279    pub fn offset_at(&self, index: usize) -> usize {
280        assert!(
281            index <= self.len(),
282            "Index {index} out of bounds 0..={}",
283            self.len()
284        );
285
286        self.offsets()
287            .scalar_at(index)
288            .as_ref()
289            .try_into()
290            .vortex_expect("Failed to convert offset to usize")
291    }
292
293    /// Access value bytes at a given index
294    ///
295    /// Will return buffer referencing underlying data without performing a copy
296    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
297        let start = self.offset_at(index);
298        let end = self.offset_at(index + 1);
299
300        self.bytes().slice(start..end)
301    }
302
303    /// Consumes self, returning a tuple containing the `DType`, the `bytes` array,
304    /// the `offsets` array, and the `validity`.
305    pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) {
306        (self.dtype, self.bytes, self.offsets, self.validity)
307    }
308}
309
310impl From<Vec<&[u8]>> for VarBinArray {
311    fn from(value: Vec<&[u8]>) -> Self {
312        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
313    }
314}
315
316impl From<Vec<Vec<u8>>> for VarBinArray {
317    fn from(value: Vec<Vec<u8>>) -> Self {
318        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
319    }
320}
321
322impl From<Vec<String>> for VarBinArray {
323    fn from(value: Vec<String>) -> Self {
324        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
325    }
326}
327
328impl From<Vec<&str>> for VarBinArray {
329    fn from(value: Vec<&str>) -> Self {
330        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
331    }
332}
333
334impl From<Vec<Option<&[u8]>>> for VarBinArray {
335    fn from(value: Vec<Option<&[u8]>>) -> Self {
336        Self::from_iter(value, DType::Binary(Nullability::Nullable))
337    }
338}
339
340impl From<Vec<Option<Vec<u8>>>> for VarBinArray {
341    fn from(value: Vec<Option<Vec<u8>>>) -> Self {
342        Self::from_iter(value, DType::Binary(Nullability::Nullable))
343    }
344}
345
346impl From<Vec<Option<String>>> for VarBinArray {
347    fn from(value: Vec<Option<String>>) -> Self {
348        Self::from_iter(value, DType::Utf8(Nullability::Nullable))
349    }
350}
351
352impl From<Vec<Option<&str>>> for VarBinArray {
353    fn from(value: Vec<Option<&str>>) -> Self {
354        Self::from_iter(value, DType::Utf8(Nullability::Nullable))
355    }
356}
357
358impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
359    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
360        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
361    }
362}
363
364impl FromIterator<Option<Vec<u8>>> for VarBinArray {
365    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
366        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
367    }
368}
369
370impl FromIterator<Option<String>> for VarBinArray {
371    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
372        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
373    }
374}
375
376impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
377    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
378        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
379    }
380}