Skip to main content

vortex_array/arrays/varbin/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use num_traits::AsPrimitive;
5use vortex_buffer::Buffer;
6use vortex_buffer::ByteBuffer;
7use vortex_dtype::DType;
8use vortex_dtype::IntegerPType;
9use vortex_dtype::Nullability;
10use vortex_dtype::match_each_integer_ptype;
11use vortex_error::VortexExpect;
12use vortex_error::VortexResult;
13use vortex_error::vortex_ensure;
14use vortex_error::vortex_err;
15
16use crate::Array;
17use crate::ArrayRef;
18use crate::IntoArray;
19use crate::ToCanonical;
20use crate::arrays::varbin::builder::VarBinBuilder;
21use crate::buffer::BufferHandle;
22use crate::stats::ArrayStats;
23use crate::validity::Validity;
24
/// A variable-length binary or UTF-8 string array.
///
/// Values are stored back to back in a single `bytes` buffer, and `offsets` holds the byte
/// positions delimiting each value. See [`VarBinArray::new_unchecked`] for the invariants the
/// components are expected to satisfy.
#[derive(Clone, Debug)]
pub struct VarBinArray {
    /// Logical type of the array; the constructors expect `DType::Binary` or `DType::Utf8`.
    pub(super) dtype: DType,
    /// Handle to the buffer holding the concatenated bytes of all values.
    pub(super) bytes: BufferHandle,
    /// Integer array of byte offsets into `bytes`; it has one more entry than the array has
    /// values.
    pub(super) offsets: ArrayRef,
    /// Validity (null-ness) information for the values.
    pub(super) validity: Validity,
    /// Statistics container; starts out as `Default::default()` on construction.
    pub(super) stats_set: ArrayStats,
}
33
34impl VarBinArray {
35    /// Creates a new [`VarBinArray`].
36    ///
37    /// # Panics
38    ///
39    /// Panics if the provided components do not satisfy the invariants documented
40    /// in [`VarBinArray::new_unchecked`].
41    pub fn new(offsets: ArrayRef, bytes: ByteBuffer, dtype: DType, validity: Validity) -> Self {
42        Self::try_new(offsets, bytes, dtype, validity).vortex_expect("VarBinArray new")
43    }
44
45    /// Creates a new [`VarBinArray`].
46    ///
47    /// # Panics
48    ///
49    /// Panics if the provided components do not satisfy the invariants documented
50    /// in [`VarBinArray::new_unchecked`].
51    pub fn new_from_handle(
52        offset: ArrayRef,
53        bytes: BufferHandle,
54        dtype: DType,
55        validity: Validity,
56    ) -> Self {
57        Self::try_new_from_handle(offset, bytes, dtype, validity).vortex_expect("VarBinArray new")
58    }
59
60    /// Constructs a new `VarBinArray`.
61    ///
62    /// See [`VarBinArray::new_unchecked`] for more information.
63    ///
64    /// # Errors
65    ///
66    /// Returns an error if the provided components do not satisfy the invariants documented in
67    /// [`VarBinArray::new_unchecked`].
68    pub fn try_new(
69        offsets: ArrayRef,
70        bytes: ByteBuffer,
71        dtype: DType,
72        validity: Validity,
73    ) -> VortexResult<Self> {
74        let bytes = BufferHandle::new_host(bytes);
75        Self::validate(&offsets, &bytes, &dtype, &validity)?;
76
77        // SAFETY: validate ensures all invariants are met.
78        Ok(unsafe { Self::new_unchecked_from_handle(offsets, bytes, dtype, validity) })
79    }
80
81    /// Constructs a new `VarBinArray` from a `BufferHandle` of memory that may exist
82    /// on the CPU or GPU.
83    ///
84    /// See [`VarBinArray::new_unchecked`] for more information.
85    ///
86    /// # Errors
87    ///
88    /// Returns an error if the provided components do not satisfy the invariants documented in
89    /// [`VarBinArray::new_unchecked`].
90    pub fn try_new_from_handle(
91        offsets: ArrayRef,
92        bytes: BufferHandle,
93        dtype: DType,
94        validity: Validity,
95    ) -> VortexResult<Self> {
96        Self::validate(&offsets, &bytes, &dtype, &validity)?;
97
98        // SAFETY: validate ensures all invariants are met.
99        Ok(unsafe { Self::new_unchecked_from_handle(offsets, bytes, dtype, validity) })
100    }
101
102    /// Creates a new [`VarBinArray`] without validation from these components:
103    ///
104    /// * `offsets` is an array of byte offsets into the `bytes` buffer.
105    /// * `bytes` is a buffer containing all the variable-length data concatenated.
106    /// * `dtype` specifies whether this contains UTF-8 strings or binary data.
107    /// * `validity` holds the null values.
108    ///
109    /// # Safety
110    ///
111    /// The caller must ensure all of the following invariants are satisfied:
112    ///
113    /// ## Offsets Requirements
114    ///
115    /// - `offsets` must be a non-nullable integer array.
116    /// - `offsets` must contain at least 1 element (for empty array, it contains \[0\]).
117    /// - All values in `offsets` must be monotonically non-decreasing.
118    /// - The first value in `offsets` must be 0.
119    /// - No offset value may exceed `bytes.len()`.
120    ///
121    /// ## Type Requirements
122    ///
123    /// - `dtype` must be exactly [`DType::Binary`] or [`DType::Utf8`].
124    /// - If `dtype` is [`DType::Utf8`], every byte slice `bytes[offsets[i]..offsets[i+1]]` must be valid UTF-8.
125    /// - `dtype.is_nullable()` must match the nullability of `validity`.
126    ///
127    /// ## Validity Requirements
128    ///
129    /// - If `validity` is [`Validity::Array`], its length must exactly equal `offsets.len() - 1`.
130    pub unsafe fn new_unchecked(
131        offsets: ArrayRef,
132        bytes: ByteBuffer,
133        dtype: DType,
134        validity: Validity,
135    ) -> Self {
136        // SAFETY: `new_unchecked_from_handle` has same invariants which should be checked
137        //  by caller.
138        unsafe {
139            Self::new_unchecked_from_handle(offsets, BufferHandle::new_host(bytes), dtype, validity)
140        }
141    }
142
143    /// Creates a new [`VarBinArray`] without validation from its components, with string data
144    /// stored in a `BufferHandle` (CPU or GPU).
145    ///
146    /// # Safety
147    ///
148    /// The caller must ensure all the invariants documented in `new_unchecked` are satisfied.
149    pub unsafe fn new_unchecked_from_handle(
150        offsets: ArrayRef,
151        bytes: BufferHandle,
152        dtype: DType,
153        validity: Validity,
154    ) -> Self {
155        #[cfg(debug_assertions)]
156        Self::validate(&offsets, &bytes, &dtype, &validity)
157            .vortex_expect("[Debug Assertion]: Invalid `VarBinArray` parameters");
158
159        Self {
160            dtype,
161            bytes,
162            offsets,
163            validity,
164            stats_set: Default::default(),
165        }
166    }
167
168    /// Validates the components that would be used to create a [`VarBinArray`].
169    ///
170    /// This function checks all the invariants required by [`VarBinArray::new_unchecked`].
171    pub fn validate(
172        offsets: &dyn Array,
173        bytes: &BufferHandle,
174        dtype: &DType,
175        validity: &Validity,
176    ) -> VortexResult<()> {
177        // Check offsets are non-nullable integer
178        vortex_ensure!(
179            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
180            MismatchedTypes: "non nullable int", offsets.dtype()
181        );
182
183        // Check dtype is Binary or Utf8
184        vortex_ensure!(
185            matches!(dtype, DType::Binary(_) | DType::Utf8(_)),
186            MismatchedTypes: "utf8 or binary", dtype
187        );
188
189        // Check nullability matches
190        vortex_ensure!(
191            dtype.is_nullable() != (validity == &Validity::NonNullable),
192            InvalidArgument: "incorrect validity {:?} for dtype {}",
193            validity,
194            dtype
195        );
196
197        // Check offsets has at least one element
198        vortex_ensure!(
199            !offsets.is_empty(),
200            InvalidArgument: "Offsets must have at least one element"
201        );
202
203        // Skip host-only validation when offsets/bytes are not host-resident.
204        if offsets.is_host() && bytes.is_on_host() {
205            let last_offset = offsets
206                .scalar_at(offsets.len() - 1)?
207                .as_primitive()
208                .as_::<usize>()
209                .ok_or_else(
210                    || vortex_err!(InvalidArgument: "Last offset must be convertible to usize"),
211                )?;
212            vortex_ensure!(
213                last_offset <= bytes.len(),
214                InvalidArgument: "Last offset {} exceeds bytes length {}",
215                last_offset,
216                bytes.len()
217            );
218        }
219
220        // Check validity length
221        if let Some(validity_len) = validity.maybe_len() {
222            vortex_ensure!(
223                validity_len == offsets.len() - 1,
224                "Validity length {} doesn't match array length {}",
225                validity_len,
226                offsets.len() - 1
227            );
228        }
229
230        // Validate UTF-8 for Utf8 dtype. Skip when offsets/bytes are not host-resident.
231        if offsets.is_host()
232            && bytes.is_on_host()
233            && matches!(dtype, DType::Utf8(_))
234            && let Some(bytes) = bytes.as_host_opt()
235        {
236            let primitive_offsets = offsets.to_primitive();
237            match_each_integer_ptype!(primitive_offsets.dtype().as_ptype(), |O| {
238                let offsets_slice = primitive_offsets.as_slice::<O>();
239                for (i, (start, end)) in offsets_slice
240                    .windows(2)
241                    .map(|o| (o[0].as_(), o[1].as_()))
242                    .enumerate()
243                {
244                    if validity.is_null(i)? {
245                        continue;
246                    }
247
248                    let string_bytes = &bytes.as_ref()[start..end];
249                    simdutf8::basic::from_utf8(string_bytes).map_err(|_| {
250                        #[allow(clippy::unwrap_used)]
251                        // run validation using `compat` package to get more detailed error message
252                        let err = simdutf8::compat::from_utf8(string_bytes).unwrap_err();
253                        vortex_err!("invalid utf-8: {err} at index {i}")
254                    })?;
255                }
256            });
257        }
258
259        Ok(())
260    }
261
    /// Access the offsets child array.
    ///
    /// The array holds one more entry than there are values; entry `i` and entry `i + 1`
    /// delimit the bytes of value `i`.
    #[inline]
    pub fn offsets(&self) -> &ArrayRef {
        &self.offsets
    }
266
    /// Access the value bytes child buffer
    ///
    /// # Note
    ///
    /// Bytes child buffer is never sliced when the array is sliced so this can include values
    /// that are not logically present in the array. Users should prefer [sliced_bytes][Self::sliced_bytes]
    /// unless they're resolving values via the offset child array.
    ///
    /// NOTE(review): this goes through `BufferHandle::as_host`, which presumably requires the
    /// bytes to be host-resident — confirm what happens for device buffers, and prefer
    /// [bytes_handle][Self::bytes_handle] when residency is not known.
    #[inline]
    pub fn bytes(&self) -> &ByteBuffer {
        self.bytes.as_host()
    }
278
    /// Access the value bytes buffer handle.
    ///
    /// Unlike [bytes][Self::bytes], this returns the stored [`BufferHandle`] as-is, without
    /// converting it to a host buffer.
    #[inline]
    pub fn bytes_handle(&self) -> &BufferHandle {
        &self.bytes
    }
284
285    /// Access value bytes child array limited to values that are logically present in
286    /// the array unlike [bytes][Self::bytes].
287    pub fn sliced_bytes(&self) -> ByteBuffer {
288        let first_offset: usize = self.offset_at(0);
289        let last_offset = self.offset_at(self.len());
290
291        self.bytes().slice(first_offset..last_offset)
292    }
293
294    pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
295        let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
296        if size < u32::MAX as usize {
297            Self::from_vec_sized::<u32, T>(vec, dtype)
298        } else {
299            Self::from_vec_sized::<u64, T>(vec, dtype)
300        }
301    }
302
303    fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
304    where
305        O: IntegerPType,
306        T: AsRef<[u8]>,
307    {
308        let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
309        for v in vec {
310            builder.append_value(v.as_ref());
311        }
312        builder.finish(dtype)
313    }
314
315    #[expect(
316        clippy::same_name_method,
317        reason = "intentionally named from_iter like Iterator::from_iter"
318    )]
319    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
320        iter: I,
321        dtype: DType,
322    ) -> Self {
323        let iter = iter.into_iter();
324        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
325        for v in iter {
326            builder.append(v.as_ref().map(|o| o.as_ref()));
327        }
328        builder.finish(dtype)
329    }
330
331    pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
332        iter: I,
333        dtype: DType,
334    ) -> Self {
335        let iter = iter.into_iter();
336        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
337        for v in iter {
338            builder.append_value(v);
339        }
340        builder.finish(dtype)
341    }
342
343    /// Get value offset at a given index
344    ///
345    /// Note: There's 1 more offsets than the elements in the array, thus last offset is at array length index
346    ///
347    /// Panics if index is out of bounds
348    pub fn offset_at(&self, index: usize) -> usize {
349        assert!(
350            index <= self.len(),
351            "Index {index} out of bounds 0..={}",
352            self.len()
353        );
354
355        (&self
356            .offsets()
357            .scalar_at(index)
358            .vortex_expect("offsets must support scalar_at"))
359            .try_into()
360            .vortex_expect("Failed to convert offset to usize")
361    }
362
363    /// Access value bytes at a given index
364    ///
365    /// Will return buffer referencing underlying data without performing a copy
366    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
367        let start = self.offset_at(index);
368        let end = self.offset_at(index + 1);
369
370        self.bytes().slice(start..end)
371    }
372
373    /// Consumes self, returning a tuple containing the `DType`, the `bytes` array,
374    /// the `offsets` array, and the `validity`.
375    pub fn into_parts(self) -> (DType, BufferHandle, ArrayRef, Validity) {
376        (self.dtype, self.bytes, self.offsets, self.validity)
377    }
378}
379
380impl VarBinArray {
381    /// Return an array containing the same data, but where the internal `offsets` start at zero
382    /// and all wasted space in the bytes child has been clipped.
383    #[doc(hidden)]
384    pub fn zero_offsets(self) -> Self {
385        if self.is_empty() {
386            return self;
387        }
388
389        let first = self.offset_at(0);
390
391        let bytes = self.sliced_bytes();
392        let dtype = self.dtype;
393        let validity = self.validity;
394        let offsets = self.offsets;
395
396        let offsets = if first == 0 {
397            offsets
398        } else {
399            let offsets = offsets.to_primitive();
400            match_each_integer_ptype!(offsets.ptype(), |P| {
401                let offsets = offsets.as_slice::<P>();
402                let buffer: Buffer<P> = offsets.iter().map(|index| index - offsets[0]).collect();
403                buffer.into_array()
404            })
405        };
406
407        // SAFETY: we make the first offset start at zero, and slice the bytes accordingly,
408        //  so all offsets stay valid.
409        unsafe { Self::new_unchecked(offsets, bytes, dtype, validity) }
410    }
411}
412
413impl From<Vec<&[u8]>> for VarBinArray {
414    fn from(value: Vec<&[u8]>) -> Self {
415        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
416    }
417}
418
419impl From<Vec<Vec<u8>>> for VarBinArray {
420    fn from(value: Vec<Vec<u8>>) -> Self {
421        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
422    }
423}
424
425impl From<Vec<String>> for VarBinArray {
426    fn from(value: Vec<String>) -> Self {
427        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
428    }
429}
430
431impl From<Vec<&str>> for VarBinArray {
432    fn from(value: Vec<&str>) -> Self {
433        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
434    }
435}
436
437impl From<Vec<Option<&[u8]>>> for VarBinArray {
438    fn from(value: Vec<Option<&[u8]>>) -> Self {
439        Self::from_iter(value, DType::Binary(Nullability::Nullable))
440    }
441}
442
443impl From<Vec<Option<Vec<u8>>>> for VarBinArray {
444    fn from(value: Vec<Option<Vec<u8>>>) -> Self {
445        Self::from_iter(value, DType::Binary(Nullability::Nullable))
446    }
447}
448
449impl From<Vec<Option<String>>> for VarBinArray {
450    fn from(value: Vec<Option<String>>) -> Self {
451        Self::from_iter(value, DType::Utf8(Nullability::Nullable))
452    }
453}
454
455impl From<Vec<Option<&str>>> for VarBinArray {
456    fn from(value: Vec<Option<&str>>) -> Self {
457        Self::from_iter(value, DType::Utf8(Nullability::Nullable))
458    }
459}
460
461impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
462    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
463        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
464    }
465}
466
467impl FromIterator<Option<Vec<u8>>> for VarBinArray {
468    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
469        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
470    }
471}
472
473impl FromIterator<Option<String>> for VarBinArray {
474    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
475        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
476    }
477}
478
479impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
480    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
481        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
482    }
483}