Skip to main content

vortex_array/arrays/varbin/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use num_traits::AsPrimitive;
5use vortex_buffer::ByteBuffer;
6use vortex_error::VortexExpect;
7use vortex_error::VortexResult;
8use vortex_error::vortex_ensure;
9use vortex_error::vortex_err;
10
11use crate::Array;
12use crate::ArrayRef;
13use crate::ToCanonical;
14use crate::arrays::varbin::builder::VarBinBuilder;
15use crate::buffer::BufferHandle;
16use crate::dtype::DType;
17use crate::dtype::IntegerPType;
18use crate::dtype::Nullability;
19use crate::match_each_integer_ptype;
20use crate::stats::ArrayStats;
21use crate::validity::Validity;
22
/// An array of variable-length binary or UTF-8 values.
///
/// Values are stored back to back in a single `bytes` buffer; the `offsets` child array
/// marks where each value starts and ends within that buffer.
#[derive(Clone, Debug)]
pub struct VarBinArray {
    /// Logical type of the array; constructors expect `DType::Binary` or `DType::Utf8`.
    pub(super) dtype: DType,
    /// Handle to the buffer holding the concatenated value bytes.
    pub(super) bytes: BufferHandle,
    /// Integer offsets into `bytes`; there is one more offset than there are values.
    pub(super) offsets: ArrayRef,
    /// Validity (null information) of the values.
    pub(super) validity: Validity,
    /// Statistics attached to this array; constructors initialise it with its default value.
    pub(super) stats_set: ArrayStats,
}
31
32impl VarBinArray {
33    /// Creates a new [`VarBinArray`].
34    ///
35    /// # Panics
36    ///
37    /// Panics if the provided components do not satisfy the invariants documented
38    /// in [`VarBinArray::new_unchecked`].
39    pub fn new(offsets: ArrayRef, bytes: ByteBuffer, dtype: DType, validity: Validity) -> Self {
40        Self::try_new(offsets, bytes, dtype, validity).vortex_expect("VarBinArray new")
41    }
42
43    /// Creates a new [`VarBinArray`].
44    ///
45    /// # Panics
46    ///
47    /// Panics if the provided components do not satisfy the invariants documented
48    /// in [`VarBinArray::new_unchecked`].
49    pub fn new_from_handle(
50        offset: ArrayRef,
51        bytes: BufferHandle,
52        dtype: DType,
53        validity: Validity,
54    ) -> Self {
55        Self::try_new_from_handle(offset, bytes, dtype, validity).vortex_expect("VarBinArray new")
56    }
57
58    /// Constructs a new `VarBinArray`.
59    ///
60    /// See [`VarBinArray::new_unchecked`] for more information.
61    ///
62    /// # Errors
63    ///
64    /// Returns an error if the provided components do not satisfy the invariants documented in
65    /// [`VarBinArray::new_unchecked`].
66    pub fn try_new(
67        offsets: ArrayRef,
68        bytes: ByteBuffer,
69        dtype: DType,
70        validity: Validity,
71    ) -> VortexResult<Self> {
72        let bytes = BufferHandle::new_host(bytes);
73        Self::validate(&offsets, &bytes, &dtype, &validity)?;
74
75        // SAFETY: validate ensures all invariants are met.
76        Ok(unsafe { Self::new_unchecked_from_handle(offsets, bytes, dtype, validity) })
77    }
78
79    /// Constructs a new `VarBinArray` from a `BufferHandle` of memory that may exist
80    /// on the CPU or GPU.
81    ///
82    /// See [`VarBinArray::new_unchecked`] for more information.
83    ///
84    /// # Errors
85    ///
86    /// Returns an error if the provided components do not satisfy the invariants documented in
87    /// [`VarBinArray::new_unchecked`].
88    pub fn try_new_from_handle(
89        offsets: ArrayRef,
90        bytes: BufferHandle,
91        dtype: DType,
92        validity: Validity,
93    ) -> VortexResult<Self> {
94        Self::validate(&offsets, &bytes, &dtype, &validity)?;
95
96        // SAFETY: validate ensures all invariants are met.
97        Ok(unsafe { Self::new_unchecked_from_handle(offsets, bytes, dtype, validity) })
98    }
99
100    /// Creates a new [`VarBinArray`] without validation from these components:
101    ///
102    /// * `offsets` is an array of byte offsets into the `bytes` buffer.
103    /// * `bytes` is a buffer containing all the variable-length data concatenated.
104    /// * `dtype` specifies whether this contains UTF-8 strings or binary data.
105    /// * `validity` holds the null values.
106    ///
107    /// # Safety
108    ///
109    /// The caller must ensure all of the following invariants are satisfied:
110    ///
111    /// ## Offsets Requirements
112    ///
113    /// - `offsets` must be a non-nullable integer array.
114    /// - `offsets` must contain at least 1 element (for empty array, it contains \[0\]).
115    /// - All values in `offsets` must be monotonically non-decreasing.
116    /// - The first value in `offsets` must be 0.
117    /// - No offset value may exceed `bytes.len()`.
118    ///
119    /// ## Type Requirements
120    ///
121    /// - `dtype` must be exactly [`DType::Binary`] or [`DType::Utf8`].
122    /// - If `dtype` is [`DType::Utf8`], every byte slice `bytes[offsets[i]..offsets[i+1]]` must be valid UTF-8.
123    /// - `dtype.is_nullable()` must match the nullability of `validity`.
124    ///
125    /// ## Validity Requirements
126    ///
127    /// - If `validity` is [`Validity::Array`], its length must exactly equal `offsets.len() - 1`.
128    pub unsafe fn new_unchecked(
129        offsets: ArrayRef,
130        bytes: ByteBuffer,
131        dtype: DType,
132        validity: Validity,
133    ) -> Self {
134        // SAFETY: `new_unchecked_from_handle` has same invariants which should be checked
135        //  by caller.
136        unsafe {
137            Self::new_unchecked_from_handle(offsets, BufferHandle::new_host(bytes), dtype, validity)
138        }
139    }
140
141    /// Creates a new [`VarBinArray`] without validation from its components, with string data
142    /// stored in a `BufferHandle` (CPU or GPU).
143    ///
144    /// # Safety
145    ///
146    /// The caller must ensure all the invariants documented in `new_unchecked` are satisfied.
147    pub unsafe fn new_unchecked_from_handle(
148        offsets: ArrayRef,
149        bytes: BufferHandle,
150        dtype: DType,
151        validity: Validity,
152    ) -> Self {
153        #[cfg(debug_assertions)]
154        Self::validate(&offsets, &bytes, &dtype, &validity)
155            .vortex_expect("[Debug Assertion]: Invalid `VarBinArray` parameters");
156
157        Self {
158            dtype,
159            bytes,
160            offsets,
161            validity,
162            stats_set: Default::default(),
163        }
164    }
165
166    /// Validates the components that would be used to create a [`VarBinArray`].
167    ///
168    /// This function checks all the invariants required by [`VarBinArray::new_unchecked`].
169    pub fn validate(
170        offsets: &ArrayRef,
171        bytes: &BufferHandle,
172        dtype: &DType,
173        validity: &Validity,
174    ) -> VortexResult<()> {
175        // Check offsets are non-nullable integer
176        vortex_ensure!(
177            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
178            MismatchedTypes: "non nullable int", offsets.dtype()
179        );
180
181        // Check dtype is Binary or Utf8
182        vortex_ensure!(
183            matches!(dtype, DType::Binary(_) | DType::Utf8(_)),
184            MismatchedTypes: "utf8 or binary", dtype
185        );
186
187        // Check nullability matches
188        vortex_ensure!(
189            dtype.is_nullable() != (validity == &Validity::NonNullable),
190            InvalidArgument: "incorrect validity {:?} for dtype {}",
191            validity,
192            dtype
193        );
194
195        // Check offsets has at least one element
196        vortex_ensure!(
197            !offsets.is_empty(),
198            InvalidArgument: "Offsets must have at least one element"
199        );
200
201        // Skip host-only validation when offsets/bytes are not host-resident.
202        if offsets.is_host() && bytes.is_on_host() {
203            let last_offset = offsets
204                .scalar_at(offsets.len() - 1)?
205                .as_primitive()
206                .as_::<usize>()
207                .ok_or_else(
208                    || vortex_err!(InvalidArgument: "Last offset must be convertible to usize"),
209                )?;
210            vortex_ensure!(
211                last_offset <= bytes.len(),
212                InvalidArgument: "Last offset {} exceeds bytes length {}",
213                last_offset,
214                bytes.len()
215            );
216        }
217
218        // Check validity length
219        if let Some(validity_len) = validity.maybe_len() {
220            vortex_ensure!(
221                validity_len == offsets.len() - 1,
222                "Validity length {} doesn't match array length {}",
223                validity_len,
224                offsets.len() - 1
225            );
226        }
227
228        // Validate UTF-8 for Utf8 dtype. Skip when offsets/bytes are not host-resident.
229        if offsets.is_host()
230            && bytes.is_on_host()
231            && matches!(dtype, DType::Utf8(_))
232            && let Some(bytes) = bytes.as_host_opt()
233        {
234            let primitive_offsets = offsets.to_primitive();
235            match_each_integer_ptype!(primitive_offsets.dtype().as_ptype(), |O| {
236                let offsets_slice = primitive_offsets.as_slice::<O>();
237                for (i, (start, end)) in offsets_slice
238                    .windows(2)
239                    .map(|o| (o[0].as_(), o[1].as_()))
240                    .enumerate()
241                {
242                    if validity.is_null(i)? {
243                        continue;
244                    }
245
246                    let string_bytes = &bytes.as_ref()[start..end];
247                    simdutf8::basic::from_utf8(string_bytes).map_err(|_| {
248                        #[allow(clippy::unwrap_used)]
249                        // run validation using `compat` package to get more detailed error message
250                        let err = simdutf8::compat::from_utf8(string_bytes).unwrap_err();
251                        vortex_err!("invalid utf-8: {err} at index {i}")
252                    })?;
253                }
254            });
255        }
256
257        Ok(())
258    }
259
    /// Returns a reference to the offsets child array.
    #[inline]
    pub fn offsets(&self) -> &ArrayRef {
        &self.offsets
    }
264
    /// Access the value bytes child buffer as a host [`ByteBuffer`].
    ///
    /// # Note
    ///
    /// The bytes child buffer is never sliced when the array is sliced, so it can include
    /// values that are not logically present in the array. Users should prefer
    /// [sliced_bytes][Self::sliced_bytes] unless they're resolving values via the offsets
    /// child array.
    ///
    /// NOTE(review): `as_host` presumably requires the buffer to be host-resident — confirm
    /// its behavior for device buffers.
    #[inline]
    pub fn bytes(&self) -> &ByteBuffer {
        self.bytes.as_host()
    }
276
    /// Access the value bytes buffer handle.
    ///
    /// Unlike [bytes][Self::bytes], this returns the handle itself rather than a host buffer.
    #[inline]
    pub fn bytes_handle(&self) -> &BufferHandle {
        &self.bytes
    }
282
283    /// Access value bytes child array limited to values that are logically present in
284    /// the array unlike [bytes][Self::bytes].
285    pub fn sliced_bytes(&self) -> ByteBuffer {
286        let first_offset: usize = self.offset_at(0);
287        let last_offset = self.offset_at(self.len());
288
289        self.bytes().slice(first_offset..last_offset)
290    }
291
292    pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
293        let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
294        if size < u32::MAX as usize {
295            Self::from_vec_sized::<u32, T>(vec, dtype)
296        } else {
297            Self::from_vec_sized::<u64, T>(vec, dtype)
298        }
299    }
300
301    fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
302    where
303        O: IntegerPType,
304        T: AsRef<[u8]>,
305    {
306        let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
307        for v in vec {
308            builder.append_value(v.as_ref());
309        }
310        builder.finish(dtype)
311    }
312
313    #[expect(
314        clippy::same_name_method,
315        reason = "intentionally named from_iter like Iterator::from_iter"
316    )]
317    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
318        iter: I,
319        dtype: DType,
320    ) -> Self {
321        let iter = iter.into_iter();
322        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
323        for v in iter {
324            builder.append(v.as_ref().map(|o| o.as_ref()));
325        }
326        builder.finish(dtype)
327    }
328
329    pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
330        iter: I,
331        dtype: DType,
332    ) -> Self {
333        let iter = iter.into_iter();
334        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
335        for v in iter {
336            builder.append_value(v);
337        }
338        builder.finish(dtype)
339    }
340
341    /// Get value offset at a given index
342    ///
343    /// Note: There's 1 more offsets than the elements in the array, thus last offset is at array length index
344    ///
345    /// Panics if index is out of bounds
346    pub fn offset_at(&self, index: usize) -> usize {
347        assert!(
348            index <= self.len(),
349            "Index {index} out of bounds 0..={}",
350            self.len()
351        );
352
353        (&self
354            .offsets()
355            .scalar_at(index)
356            .vortex_expect("offsets must support scalar_at"))
357            .try_into()
358            .vortex_expect("Failed to convert offset to usize")
359    }
360
361    /// Access value bytes at a given index
362    ///
363    /// Will return buffer referencing underlying data without performing a copy
364    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
365        let start = self.offset_at(index);
366        let end = self.offset_at(index + 1);
367
368        self.bytes().slice(start..end)
369    }
370
371    /// Consumes self, returning a tuple containing the `DType`, the `bytes` array,
372    /// the `offsets` array, and the `validity`.
373    pub fn into_parts(self) -> (DType, BufferHandle, ArrayRef, Validity) {
374        (self.dtype, self.bytes, self.offsets, self.validity)
375    }
376}
377
378impl From<Vec<&[u8]>> for VarBinArray {
379    fn from(value: Vec<&[u8]>) -> Self {
380        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
381    }
382}
383
384impl From<Vec<Vec<u8>>> for VarBinArray {
385    fn from(value: Vec<Vec<u8>>) -> Self {
386        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
387    }
388}
389
390impl From<Vec<String>> for VarBinArray {
391    fn from(value: Vec<String>) -> Self {
392        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
393    }
394}
395
396impl From<Vec<&str>> for VarBinArray {
397    fn from(value: Vec<&str>) -> Self {
398        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
399    }
400}
401
402impl From<Vec<Option<&[u8]>>> for VarBinArray {
403    fn from(value: Vec<Option<&[u8]>>) -> Self {
404        Self::from_iter(value, DType::Binary(Nullability::Nullable))
405    }
406}
407
408impl From<Vec<Option<Vec<u8>>>> for VarBinArray {
409    fn from(value: Vec<Option<Vec<u8>>>) -> Self {
410        Self::from_iter(value, DType::Binary(Nullability::Nullable))
411    }
412}
413
414impl From<Vec<Option<String>>> for VarBinArray {
415    fn from(value: Vec<Option<String>>) -> Self {
416        Self::from_iter(value, DType::Utf8(Nullability::Nullable))
417    }
418}
419
420impl From<Vec<Option<&str>>> for VarBinArray {
421    fn from(value: Vec<Option<&str>>) -> Self {
422        Self::from_iter(value, DType::Utf8(Nullability::Nullable))
423    }
424}
425
426impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
427    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
428        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
429    }
430}
431
432impl FromIterator<Option<Vec<u8>>> for VarBinArray {
433    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
434        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
435    }
436}
437
438impl FromIterator<Option<String>> for VarBinArray {
439    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
440        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
441    }
442}
443
444impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
445    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
446        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
447    }
448}