vortex_array/arrays/varbin/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Debug;
5
6pub(crate) use compute::compute_min_max;
7use num_traits::PrimInt;
8use vortex_buffer::ByteBuffer;
9use vortex_dtype::{DType, NativePType, Nullability};
10use vortex_error::{VortexExpect as _, VortexResult, VortexUnwrap as _, vortex_bail, vortex_err};
11use vortex_scalar::Scalar;
12
13use crate::arrays::varbin::builder::VarBinBuilder;
14use crate::stats::{ArrayStats, StatsSetRef};
15use crate::validity::Validity;
16use crate::vtable::{
17    ArrayVTable, NotSupported, VTable, ValidityHelper, ValidityVTableFromValidityHelper,
18};
19use crate::{Array, ArrayRef, EncodingId, EncodingRef, vtable};
20
21mod accessor;
22pub mod builder;
23mod canonical;
24mod compute;
25mod ops;
26mod serde;
27
// Invokes the crate's `vtable!` macro for the `VarBin` encoding.
// NOTE(review): presumably this is what declares `VarBinVTable`, which is used below
// but not defined in this file — confirm against the macro definition.
vtable!(VarBin);
29
// Wires the VarBin encoding into the `VTable` trait. Array, canonical, operations,
// visitor and serde behaviour are provided by `VarBinVTable` itself; validity is
// derived from the array's `ValidityHelper` impl; compute, encode and pipeline are
// declared as `NotSupported`.
impl VTable for VarBinVTable {
    type Array = VarBinArray;
    type Encoding = VarBinEncoding;
    type ArrayVTable = Self;
    type CanonicalVTable = Self;
    type OperationsVTable = Self;
    type ValidityVTable = ValidityVTableFromValidityHelper;
    type VisitorVTable = Self;
    type ComputeVTable = NotSupported;
    type EncodeVTable = NotSupported;
    type PipelineVTable = NotSupported;
    type SerdeVTable = Self;

    /// Returns the identifier of this encoding, `"vortex.varbin"`.
    fn id(_encoding: &Self::Encoding) -> EncodingId {
        EncodingId::new_ref("vortex.varbin")
    }

    /// Returns an [`EncodingRef`] wrapping [`VarBinEncoding`]; the array argument is unused.
    fn encoding(_array: &Self::Array) -> EncodingRef {
        EncodingRef::new_ref(VarBinEncoding.as_ref())
    }
}
51
/// Variable-length binary / UTF-8 array: a byte buffer plus an offsets array that
/// delimits the individual values inside it.
#[derive(Clone, Debug)]
pub struct VarBinArray {
    /// Logical type of the array; `try_new` only accepts `DType::Binary` or `DType::Utf8`.
    dtype: DType,
    /// Buffer holding the value bytes; value `i` spans `offsets[i]..offsets[i + 1]`.
    bytes: ByteBuffer,
    /// Offsets into `bytes`; `try_new` requires a non-nullable integer dtype. The array
    /// length is computed as the number of offsets minus one.
    offsets: ArrayRef,
    /// Validity of the values; `try_new` requires it to agree with the nullability of `dtype`.
    validity: Validity,
    /// Statistics for this array; `try_new` initialises it with `Default::default()`.
    stats_set: ArrayStats,
}
60
/// Unit type naming the VarBin encoding; it is the `VTable::Encoding` associated type of
/// `VarBinVTable` and the value wrapped by `VarBinVTable::encoding`.
#[derive(Clone, Debug)]
pub struct VarBinEncoding;
63
64impl VarBinArray {
65    pub fn new(offsets: ArrayRef, bytes: ByteBuffer, dtype: DType, validity: Validity) -> Self {
66        Self::try_new(offsets, bytes, dtype, validity).vortex_expect("VarBinArray new")
67    }
68
69    pub fn try_new(
70        offsets: ArrayRef,
71        bytes: ByteBuffer,
72        dtype: DType,
73        validity: Validity,
74    ) -> VortexResult<Self> {
75        if !offsets.dtype().is_int() || offsets.dtype().is_nullable() {
76            vortex_bail!(MismatchedTypes: "non nullable int", offsets.dtype());
77        }
78        if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
79            vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
80        }
81        if dtype.is_nullable() == (validity == Validity::NonNullable) {
82            vortex_bail!("incorrect validity {:?}", validity);
83        }
84
85        Ok(Self {
86            dtype,
87            bytes,
88            offsets,
89            validity,
90            stats_set: Default::default(),
91        })
92    }
93
94    #[inline]
95    pub fn offsets(&self) -> &ArrayRef {
96        &self.offsets
97    }
98
99    /// Access the value bytes child buffer
100    ///
101    /// # Note
102    ///
103    /// Bytes child buffer is never sliced when the array is sliced so this can include values
104    /// that are not logically present in the array. Users should prefer [sliced_bytes][Self::sliced_bytes]
105    /// unless they're resolving values via the offset child array.
106    #[inline]
107    pub fn bytes(&self) -> &ByteBuffer {
108        &self.bytes
109    }
110
111    /// Access value bytes child array limited to values that are logically present in
112    /// the array unlike [bytes][Self::bytes].
113    pub fn sliced_bytes(&self) -> ByteBuffer {
114        let first_offset: usize = self.offset_at(0);
115        let last_offset = self.offset_at(self.len());
116
117        self.bytes().slice(first_offset..last_offset)
118    }
119
120    pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
121        let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
122        if size < u32::MAX as usize {
123            Self::from_vec_sized::<u32, T>(vec, dtype)
124        } else {
125            Self::from_vec_sized::<u64, T>(vec, dtype)
126        }
127    }
128
129    fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
130    where
131        O: NativePType + PrimInt,
132        T: AsRef<[u8]>,
133    {
134        let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
135        for v in vec {
136            builder.append_value(v.as_ref());
137        }
138        builder.finish(dtype)
139    }
140
141    #[allow(clippy::same_name_method)]
142    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
143        iter: I,
144        dtype: DType,
145    ) -> Self {
146        let iter = iter.into_iter();
147        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
148        for v in iter {
149            builder.append(v.as_ref().map(|o| o.as_ref()));
150        }
151        builder.finish(dtype)
152    }
153
154    pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
155        iter: I,
156        dtype: DType,
157    ) -> Self {
158        let iter = iter.into_iter();
159        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
160        for v in iter {
161            builder.append_value(v);
162        }
163        builder.finish(dtype)
164    }
165
166    /// Get value offset at a given index
167    ///
168    /// Note: There's 1 more offsets than the elements in the array, thus last offset is at array length index
169    ///
170    /// Panics if index is out of bounds
171    pub fn offset_at(&self, index: usize) -> usize {
172        assert!(
173            index <= self.len(),
174            "Index {index} out of bounds 0..={}",
175            self.len()
176        );
177
178        self.offsets()
179            .scalar_at(index)
180            .as_ref()
181            .try_into()
182            .vortex_expect("Failed to convert offset to usize")
183    }
184
185    /// Access value bytes at a given index
186    ///
187    /// Will return buffer referencing underlying data without performing a copy
188    pub fn bytes_at(&self, index: usize) -> ByteBuffer {
189        let start = self.offset_at(index);
190        let end = self.offset_at(index + 1);
191
192        self.bytes().slice(start..end)
193    }
194
195    /// Consumes self, returning a tuple containing the `DType`, the `bytes` array,
196    /// the `offsets` array, and the `validity`.
197    pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) {
198        (self.dtype, self.bytes, self.offsets, self.validity)
199    }
200}
201
impl ValidityHelper for VarBinArray {
    /// Returns a reference to the validity stored on this array.
    fn validity(&self) -> &Validity {
        &self.validity
    }
}
207
208impl ArrayVTable<VarBinVTable> for VarBinVTable {
209    fn len(array: &VarBinArray) -> usize {
210        array.offsets().len().saturating_sub(1)
211    }
212
213    fn dtype(array: &VarBinArray) -> &DType {
214        &array.dtype
215    }
216
217    fn stats(array: &VarBinArray) -> StatsSetRef<'_> {
218        array.stats_set.to_ref(array.as_ref())
219    }
220}
221
222impl From<Vec<&[u8]>> for VarBinArray {
223    fn from(value: Vec<&[u8]>) -> Self {
224        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
225    }
226}
227
228impl From<Vec<Vec<u8>>> for VarBinArray {
229    fn from(value: Vec<Vec<u8>>) -> Self {
230        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
231    }
232}
233
234impl From<Vec<String>> for VarBinArray {
235    fn from(value: Vec<String>) -> Self {
236        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
237    }
238}
239
240impl From<Vec<&str>> for VarBinArray {
241    fn from(value: Vec<&str>) -> Self {
242        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
243    }
244}
245
246impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
247    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
248        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
249    }
250}
251
252impl FromIterator<Option<Vec<u8>>> for VarBinArray {
253    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
254        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
255    }
256}
257
258impl FromIterator<Option<String>> for VarBinArray {
259    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
260        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
261    }
262}
263
264impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
265    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
266        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
267    }
268}
269
270pub fn varbin_scalar(value: ByteBuffer, dtype: &DType) -> Scalar {
271    if matches!(dtype, DType::Utf8(_)) {
272        Scalar::try_utf8(value, dtype.nullability())
273            .map_err(|err| vortex_err!("Failed to create scalar from utf8 buffer: {}", err))
274            .vortex_unwrap()
275    } else {
276        Scalar::binary(value, dtype.nullability())
277    }
278}
279
#[cfg(test)]
mod test {
    use rstest::{fixture, rstest};
    use vortex_buffer::Buffer;
    use vortex_dtype::{DType, Nullability};

    use crate::arrays::primitive::PrimitiveArray;
    use crate::arrays::varbin::VarBinArray;
    use crate::validity::Validity;
    use crate::{Array, ArrayRef, IntoArray};

    /// Two-value, non-nullable UTF-8 array: "hello world" (bytes 0..11) followed by
    /// "hello world this is a long string" (bytes 11..44).
    #[fixture]
    fn binary_array() -> ArrayRef {
        let offsets = PrimitiveArray::from_iter([0, 11, 44]).into_array();
        let values = Buffer::copy_from("hello worldhello world this is a long string".as_bytes());
        let dtype = DType::Utf8(Nullability::NonNullable);

        VarBinArray::try_new(offsets, values, dtype, Validity::NonNullable)
            .unwrap()
            .into_array()
    }

    /// Both values can be read back as scalars and the length is reported correctly.
    #[rstest]
    pub fn test_scalar_at(binary_array: ArrayRef) {
        assert_eq!(binary_array.len(), 2);

        let first = binary_array.scalar_at(0);
        assert_eq!(first, "hello world".into());

        let second = binary_array.scalar_at(1);
        assert_eq!(second, "hello world this is a long string".into())
    }

    /// Slicing off the first value leaves the second value at index 0.
    #[rstest]
    pub fn slice_array(binary_array: ArrayRef) {
        let tail = binary_array.slice(1, 2);
        let only_value = tail.scalar_at(0);
        assert_eq!(only_value, "hello world this is a long string".into());
    }
}