vortex_array/arrays/varbin/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use num_traits::AsPrimitive;
5use vortex_buffer::ByteBuffer;
6use vortex_dtype::{DType, IntegerPType, Nullability, match_each_integer_ptype};
7use vortex_error::{VortexExpect, VortexResult, vortex_ensure, vortex_err};
8
9use crate::arrays::varbin::builder::VarBinBuilder;
10use crate::stats::ArrayStats;
11use crate::validity::Validity;
12use crate::{Array, ArrayRef, ToCanonical};
13
/// Array of variable-length binary or UTF-8 values.
///
/// The values are stored concatenated in a single `bytes` buffer and delimited by an `offsets`
/// array that holds one more entry than there are elements.
#[derive(Clone, Debug)]
pub struct VarBinArray {
    /// Logical type of the array; validation only accepts `DType::Binary` or `DType::Utf8`.
    pub(super) dtype: DType,
    /// Buffer containing all the variable-length data concatenated.
    bytes: ByteBuffer,
    /// Byte offsets into `bytes`.
    offsets: ArrayRef,
    /// Holds the null values; its nullability must agree with `dtype`.
    pub(super) validity: Validity,
    /// Statistics attached to this array; default-initialised on construction.
    pub(super) stats_set: ArrayStats,
}
22
23impl VarBinArray {
24    /// Creates a new [`VarBinArray`].
25    ///
26    /// # Panics
27    ///
28    /// Panics if the provided components do not satisfy the invariants documented
29    /// in [`VarBinArray::new_unchecked`].
30    pub fn new(offsets: ArrayRef, bytes: ByteBuffer, dtype: DType, validity: Validity) -> Self {
31        Self::try_new(offsets, bytes, dtype, validity).vortex_expect("VarBinArray new")
32    }
33
34    /// Constructs a new `VarBinArray`.
35    ///
36    /// See [`VarBinArray::new_unchecked`] for more information.
37    ///
38    /// # Errors
39    ///
40    /// Returns an error if the provided components do not satisfy the invariants documented in
41    /// [`VarBinArray::new_unchecked`].
42    pub fn try_new(
43        offsets: ArrayRef,
44        bytes: ByteBuffer,
45        dtype: DType,
46        validity: Validity,
47    ) -> VortexResult<Self> {
48        Self::validate(&offsets, &bytes, &dtype, &validity)?;
49
50        // SAFETY: validate ensures all invariants are met.
51        Ok(unsafe { Self::new_unchecked(offsets, bytes, dtype, validity) })
52    }
53
54    /// Creates a new [`VarBinArray`] without validation from these components:
55    ///
56    /// * `offsets` is an array of byte offsets into the `bytes` buffer.
57    /// * `bytes` is a buffer containing all the variable-length data concatenated.
58    /// * `dtype` specifies whether this contains UTF-8 strings or binary data.
59    /// * `validity` holds the null values.
60    ///
61    /// # Safety
62    ///
63    /// The caller must ensure all of the following invariants are satisfied:
64    ///
65    /// ## Offsets Requirements
66    ///
67    /// - `offsets` must be a non-nullable integer array.
68    /// - `offsets` must contain at least 1 element (for empty array, it contains \[0\]).
69    /// - All values in `offsets` must be monotonically non-decreasing.
70    /// - The first value in `offsets` must be 0.
71    /// - No offset value may exceed `bytes.len()`.
72    ///
73    /// ## Type Requirements
74    ///
75    /// - `dtype` must be exactly [`DType::Binary`] or [`DType::Utf8`].
76    /// - If `dtype` is [`DType::Utf8`], every byte slice `bytes[offsets[i]..offsets[i+1]]` must be valid UTF-8.
77    /// - `dtype.is_nullable()` must match the nullability of `validity`.
78    ///
79    /// ## Validity Requirements
80    ///
81    /// - If `validity` is [`Validity::Array`], its length must exactly equal `offsets.len() - 1`.
82    pub unsafe fn new_unchecked(
83        offsets: ArrayRef,
84        bytes: ByteBuffer,
85        dtype: DType,
86        validity: Validity,
87    ) -> Self {
88        #[cfg(debug_assertions)]
89        Self::validate(&offsets, &bytes, &dtype, &validity)
90            .vortex_expect("[Debug Assertion]: Invalid `VarBinArray` parameters");
91
92        Self {
93            dtype,
94            bytes,
95            offsets,
96            validity,
97            stats_set: Default::default(),
98        }
99    }
100
101    /// Validates the components that would be used to create a [`VarBinArray`].
102    ///
103    /// This function checks all the invariants required by [`VarBinArray::new_unchecked`].
104    pub fn validate(
105        offsets: &dyn Array,
106        bytes: &ByteBuffer,
107        dtype: &DType,
108        validity: &Validity,
109    ) -> VortexResult<()> {
110        // Check offsets are non-nullable integer
111        vortex_ensure!(
112            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
113            MismatchedTypes: "non nullable int", offsets.dtype()
114        );
115
116        // Check dtype is Binary or Utf8
117        vortex_ensure!(
118            matches!(dtype, DType::Binary(_) | DType::Utf8(_)),
119            MismatchedTypes: "utf8 or binary", dtype
120        );
121
122        // Check nullability matches
123        vortex_ensure!(
124            dtype.is_nullable() != (validity == &Validity::NonNullable),
125            "incorrect validity {:?} for dtype {}",
126            validity,
127            dtype
128        );
129
130        // Check offsets has at least one element
131        vortex_ensure!(
132            !offsets.is_empty(),
133            "Offsets must have at least one element"
134        );
135
136        // Check offsets are sorted
137        if let Some(is_sorted) = offsets.statistics().compute_is_sorted() {
138            vortex_ensure!(is_sorted, "offsets must be sorted");
139        }
140
141        let last_offset = offsets
142            .scalar_at(offsets.len() - 1)
143            .as_primitive()
144            .as_::<usize>()
145            .ok_or_else(|| vortex_err!("Last offset must be convertible to usize"))?;
146        vortex_ensure!(
147            last_offset <= bytes.len(),
148            "Last offset {} exceeds bytes length {}",
149            last_offset,
150            bytes.len()
151        );
152
153        // Check validity length
154        if let Some(validity_len) = validity.maybe_len() {
155            vortex_ensure!(
156                validity_len == offsets.len() - 1,
157                "Validity length {} doesn't match array length {}",
158                validity_len,
159                offsets.len() - 1
160            );
161        }
162
163        // Validate UTF-8 for Utf8 dtype
164        if matches!(dtype, DType::Utf8(_)) {
165            let primitive_offsets = offsets.to_primitive();
166            match_each_integer_ptype!(primitive_offsets.dtype().as_ptype(), |O| {
167                let offsets_slice = primitive_offsets.as_slice::<O>();
168                for (i, (start, end)) in offsets_slice
169                    .windows(2)
170                    .map(|o| (o[0].as_(), o[1].as_()))
171                    .enumerate()
172                {
173                    if validity.is_null(i) {
174                        continue;
175                    }
176
177                    let string_bytes = &bytes.as_ref()[start..end];
178                    simdutf8::basic::from_utf8(string_bytes).map_err(|_| {
179                        #[allow(clippy::unwrap_used)]
180                        // run validation using `compat` package to get more detailed error message
181                        let err = simdutf8::compat::from_utf8(string_bytes).unwrap_err();
182                        vortex_err!("invalid utf-8: {err} at index {i}")
183                    })?;
184                }
185            });
186        }
187
188        Ok(())
189    }
190
    /// Returns a reference to the offsets child array.
    ///
    /// The offsets are byte positions into the buffer returned by [`Self::bytes`].
    #[inline]
    pub fn offsets(&self) -> &ArrayRef {
        &self.offsets
    }
195
    /// Returns a reference to the value bytes child buffer.
    ///
    /// # Note
    ///
    /// The bytes child buffer is never sliced when the array is sliced, so it can include
    /// values that are not logically present in the array. Prefer
    /// [`sliced_bytes`][Self::sliced_bytes] unless you are resolving values via the offsets
    /// child array.
    #[inline]
    pub fn bytes(&self) -> &ByteBuffer {
        &self.bytes
    }
207
208    /// Access value bytes child array limited to values that are logically present in
209    /// the array unlike [bytes][Self::bytes].
210    pub fn sliced_bytes(&self) -> ByteBuffer {
211        let first_offset: usize = self.offset_at(0);
212        let last_offset = self.offset_at(self.len());
213
214        self.bytes().slice(first_offset..last_offset)
215    }
216
217    pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
218        let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
219        if size < u32::MAX as usize {
220            Self::from_vec_sized::<u32, T>(vec, dtype)
221        } else {
222            Self::from_vec_sized::<u64, T>(vec, dtype)
223        }
224    }
225
226    fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
227    where
228        O: IntegerPType,
229        T: AsRef<[u8]>,
230    {
231        let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
232        for v in vec {
233            builder.append_value(v.as_ref());
234        }
235        builder.finish(dtype)
236    }
237
238    #[allow(clippy::same_name_method)]
239    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
240        iter: I,
241        dtype: DType,
242    ) -> Self {
243        let iter = iter.into_iter();
244        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
245        for v in iter {
246            builder.append(v.as_ref().map(|o| o.as_ref()));
247        }
248        builder.finish(dtype)
249    }
250
251    pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
252        iter: I,
253        dtype: DType,
254    ) -> Self {
255        let iter = iter.into_iter();
256        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
257        for v in iter {
258            builder.append_value(v);
259        }
260        builder.finish(dtype)
261    }
262
263    /// Get value offset at a given index
264    ///
265    /// Note: There's 1 more offsets than the elements in the array, thus last offset is at array length index
266    ///
267    /// Panics if index is out of bounds
268    pub fn offset_at(&self, index: usize) -> usize {
269        assert!(
270            index <= self.len(),
271            "Index {index} out of bounds 0..={}",
272            self.len()
273        );
274
275        self.offsets()
276            .scalar_at(index)
277            .as_ref()
278            .try_into()
279            .vortex_expect("Failed to convert offset to usize")
280    }
281
282    /// Access value bytes at a given index
283    ///
284    /// Will return buffer referencing underlying data without performing a copy
285    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
286        let start = self.offset_at(index);
287        let end = self.offset_at(index + 1);
288
289        self.bytes().slice(start..end)
290    }
291
292    /// Consumes self, returning a tuple containing the `DType`, the `bytes` array,
293    /// the `offsets` array, and the `validity`.
294    pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) {
295        (self.dtype, self.bytes, self.offsets, self.validity)
296    }
297}
298
299impl From<Vec<&[u8]>> for VarBinArray {
300    fn from(value: Vec<&[u8]>) -> Self {
301        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
302    }
303}
304
305impl From<Vec<Vec<u8>>> for VarBinArray {
306    fn from(value: Vec<Vec<u8>>) -> Self {
307        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
308    }
309}
310
311impl From<Vec<String>> for VarBinArray {
312    fn from(value: Vec<String>) -> Self {
313        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
314    }
315}
316
317impl From<Vec<&str>> for VarBinArray {
318    fn from(value: Vec<&str>) -> Self {
319        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
320    }
321}
322
323impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
324    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
325        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
326    }
327}
328
329impl FromIterator<Option<Vec<u8>>> for VarBinArray {
330    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
331        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
332    }
333}
334
335impl FromIterator<Option<String>> for VarBinArray {
336    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
337        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
338    }
339}
340
341impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
342    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
343        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
344    }
345}