vortex_array/arrays/varbin/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Debug;
5
6pub use compute::compute_min_max;
7use num_traits::PrimInt;
8use vortex_buffer::ByteBuffer;
9use vortex_dtype::{DType, NativePType, Nullability};
10use vortex_error::{
11    VortexExpect as _, VortexResult, VortexUnwrap as _, vortex_bail, vortex_err, vortex_panic,
12};
13use vortex_scalar::Scalar;
14
15use crate::arrays::varbin::builder::VarBinBuilder;
16use crate::stats::{ArrayStats, StatsSetRef};
17use crate::validity::Validity;
18use crate::vtable::{
19    ArrayVTable, NotSupported, VTable, ValidityHelper, ValidityVTableFromValidityHelper,
20};
21use crate::{Array, ArrayRef, EncodingId, EncodingRef, vtable};
22
23mod accessor;
24pub mod builder;
25mod canonical;
26mod compute;
27mod ops;
28mod serde;
29
30vtable!(VarBin);
31
32impl VTable for VarBinVTable {
33    type Array = VarBinArray;
34    type Encoding = VarBinEncoding;
35    type ArrayVTable = Self;
36    type CanonicalVTable = Self;
37    type OperationsVTable = Self;
38    type ValidityVTable = ValidityVTableFromValidityHelper;
39    type VisitorVTable = Self;
40    type ComputeVTable = NotSupported;
41    type EncodeVTable = NotSupported;
42    type SerdeVTable = Self;
43
44    fn id(_encoding: &Self::Encoding) -> EncodingId {
45        EncodingId::new_ref("vortex.varbin")
46    }
47
48    fn encoding(_array: &Self::Array) -> EncodingRef {
49        EncodingRef::new_ref(VarBinEncoding.as_ref())
50    }
51}
52
53#[derive(Clone, Debug)]
54pub struct VarBinArray {
55    dtype: DType,
56    bytes: ByteBuffer,
57    offsets: ArrayRef,
58    validity: Validity,
59    stats_set: ArrayStats,
60}
61
/// Unit marker type naming the VarBin encoding (`VTable::Encoding` for `VarBinVTable`).
#[derive(Debug, Clone)]
pub struct VarBinEncoding;
64
65impl VarBinArray {
66    pub fn try_new(
67        offsets: ArrayRef,
68        bytes: ByteBuffer,
69        dtype: DType,
70        validity: Validity,
71    ) -> VortexResult<Self> {
72        if !offsets.dtype().is_int() || offsets.dtype().is_nullable() {
73            vortex_bail!(MismatchedTypes: "non nullable int", offsets.dtype());
74        }
75        if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
76            vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
77        }
78        if dtype.is_nullable() == (validity == Validity::NonNullable) {
79            vortex_bail!("incorrect validity {:?}", validity);
80        }
81
82        Ok(Self {
83            dtype,
84            bytes,
85            offsets,
86            validity,
87            stats_set: Default::default(),
88        })
89    }
90
91    #[inline]
92    pub fn offsets(&self) -> &ArrayRef {
93        &self.offsets
94    }
95
96    /// Access the value bytes child buffer
97    ///
98    /// # Note
99    ///
100    /// Bytes child buffer is never sliced when the array is sliced so this can include values
101    /// that are not logically present in the array. Users should prefer [sliced_bytes][Self::sliced_bytes]
102    /// unless they're resolving values via the offset child array.
103    #[inline]
104    pub fn bytes(&self) -> &ByteBuffer {
105        &self.bytes
106    }
107
108    /// Access value bytes child array limited to values that are logically present in
109    /// the array unlike [bytes][Self::bytes].
110    pub fn sliced_bytes(&self) -> ByteBuffer {
111        let first_offset: usize = self.offset_at(0).vortex_expect("1st offset");
112        let last_offset = self.offset_at(self.len()).vortex_expect("Last offset");
113
114        self.bytes().slice(first_offset..last_offset)
115    }
116
117    pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
118        let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
119        if size < u32::MAX as usize {
120            Self::from_vec_sized::<u32, T>(vec, dtype)
121        } else {
122            Self::from_vec_sized::<u64, T>(vec, dtype)
123        }
124    }
125
126    fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
127    where
128        O: NativePType + PrimInt,
129        T: AsRef<[u8]>,
130    {
131        let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
132        for v in vec {
133            builder.append_value(v.as_ref());
134        }
135        builder.finish(dtype)
136    }
137
138    #[allow(clippy::same_name_method)]
139    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
140        iter: I,
141        dtype: DType,
142    ) -> Self {
143        let iter = iter.into_iter();
144        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
145        for v in iter {
146            builder.append(v.as_ref().map(|o| o.as_ref()));
147        }
148        builder.finish(dtype)
149    }
150
151    pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
152        iter: I,
153        dtype: DType,
154    ) -> Self {
155        let iter = iter.into_iter();
156        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
157        for v in iter {
158            builder.append_value(v);
159        }
160        builder.finish(dtype)
161    }
162
163    /// Get value offset at a given index
164    ///
165    /// Note: There's 1 more offsets than the elements in the array, thus last offset is at array length index
166    pub fn offset_at(&self, index: usize) -> VortexResult<usize> {
167        if index > self.len() + 1 {
168            vortex_bail!(OutOfBounds: index, 0, self.len() + 1)
169        }
170
171        // TODO(ngates): PrimitiveArrayTrait should have get_scalar(idx) -> Option<T> method
172        Ok(self
173            .offsets()
174            .scalar_at(index)
175            .unwrap_or_else(|err| vortex_panic!(err, "Failed to get offset at index: {}", index))
176            .as_ref()
177            .try_into()
178            .vortex_expect("Failed to convert offset to usize"))
179    }
180
181    /// Access value bytes at a given index
182    ///
183    /// Will return buffer referncing underlying data without performing a copy
184    pub fn bytes_at(&self, index: usize) -> VortexResult<ByteBuffer> {
185        let start = self.offset_at(index)?;
186        let end = self.offset_at(index + 1)?;
187
188        Ok(self.bytes().slice(start..end))
189    }
190
191    /// Consumes self, returning a tuple containing the `DType`, the `bytes` array,
192    /// the `offsets` array, and the `validity`.
193    pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) {
194        (self.dtype, self.bytes, self.offsets, self.validity)
195    }
196}
197
198impl ValidityHelper for VarBinArray {
199    fn validity(&self) -> &Validity {
200        &self.validity
201    }
202}
203
204impl ArrayVTable<VarBinVTable> for VarBinVTable {
205    fn len(array: &VarBinArray) -> usize {
206        array.offsets().len().saturating_sub(1)
207    }
208
209    fn dtype(array: &VarBinArray) -> &DType {
210        &array.dtype
211    }
212
213    fn stats(array: &VarBinArray) -> StatsSetRef<'_> {
214        array.stats_set.to_ref(array.as_ref())
215    }
216}
217
218impl From<Vec<&[u8]>> for VarBinArray {
219    fn from(value: Vec<&[u8]>) -> Self {
220        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
221    }
222}
223
224impl From<Vec<Vec<u8>>> for VarBinArray {
225    fn from(value: Vec<Vec<u8>>) -> Self {
226        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
227    }
228}
229
230impl From<Vec<String>> for VarBinArray {
231    fn from(value: Vec<String>) -> Self {
232        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
233    }
234}
235
236impl From<Vec<&str>> for VarBinArray {
237    fn from(value: Vec<&str>) -> Self {
238        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
239    }
240}
241
242impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
243    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
244        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
245    }
246}
247
248impl FromIterator<Option<Vec<u8>>> for VarBinArray {
249    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
250        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
251    }
252}
253
254impl FromIterator<Option<String>> for VarBinArray {
255    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
256        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
257    }
258}
259
260impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
261    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
262        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
263    }
264}
265
266pub fn varbin_scalar(value: ByteBuffer, dtype: &DType) -> Scalar {
267    if matches!(dtype, DType::Utf8(_)) {
268        Scalar::try_utf8(value, dtype.nullability())
269            .map_err(|err| vortex_err!("Failed to create scalar from utf8 buffer: {}", err))
270            .vortex_unwrap()
271    } else {
272        Scalar::binary(value, dtype.nullability())
273    }
274}
275
#[cfg(test)]
mod test {
    use rstest::{fixture, rstest};
    use vortex_buffer::Buffer;
    use vortex_dtype::{DType, Nullability};

    use crate::arrays::primitive::PrimitiveArray;
    use crate::arrays::varbin::VarBinArray;
    use crate::validity::Validity;
    use crate::{Array, ArrayRef, IntoArray};

    /// Two-element, non-nullable UTF-8 array: "hello world" (bytes 0..11) followed by
    /// "hello world this is a long string" (bytes 11..44).
    #[fixture]
    fn binary_array() -> ArrayRef {
        let offsets = PrimitiveArray::from_iter([0, 11, 44]).into_array();
        let values = Buffer::copy_from("hello worldhello world this is a long string".as_bytes());
        let dtype = DType::Utf8(Nullability::NonNullable);

        VarBinArray::try_new(offsets, values, dtype, Validity::NonNullable)
            .unwrap()
            .into_array()
    }

    /// Each element is readable as a scalar and matches the fixture contents.
    #[rstest]
    pub fn test_scalar_at(binary_array: ArrayRef) {
        assert_eq!(binary_array.len(), 2);

        let expected = ["hello world", "hello world this is a long string"];
        for (idx, text) in expected.into_iter().enumerate() {
            assert_eq!(binary_array.scalar_at(idx).unwrap(), text.into());
        }
    }

    /// Slicing off the first element leaves the second one at index 0.
    #[rstest]
    pub fn slice_array(binary_array: ArrayRef) {
        let tail = binary_array.slice(1, 2).unwrap();
        let first = tail.scalar_at(0).unwrap();
        assert_eq!(first, "hello world this is a long string".into());
    }
}