vortex_array/arrays/varbin/
mod.rs

1use std::fmt::Debug;
2
3pub use compute::compute_min_max;
4use num_traits::PrimInt;
5use vortex_buffer::ByteBuffer;
6use vortex_dtype::{DType, NativePType, Nullability};
7use vortex_error::{
8    VortexExpect as _, VortexResult, VortexUnwrap as _, vortex_bail, vortex_err, vortex_panic,
9};
10use vortex_scalar::Scalar;
11
12use crate::arrays::varbin::builder::VarBinBuilder;
13use crate::stats::{ArrayStats, StatsSetRef};
14use crate::validity::Validity;
15use crate::vtable::{
16    ArrayVTable, NotSupported, VTable, ValidityHelper, ValidityVTableFromValidityHelper,
17};
18use crate::{Array, ArrayRef, EncodingId, EncodingRef, vtable};
19
20mod accessor;
21pub mod builder;
22mod canonical;
23mod compute;
24mod ops;
25mod serde;
26
27vtable!(VarBin);
28
29impl VTable for VarBinVTable {
30    type Array = VarBinArray;
31    type Encoding = VarBinEncoding;
32    type ArrayVTable = Self;
33    type CanonicalVTable = Self;
34    type OperationsVTable = Self;
35    type ValidityVTable = ValidityVTableFromValidityHelper;
36    type VisitorVTable = Self;
37    type ComputeVTable = NotSupported;
38    type EncodeVTable = NotSupported;
39    type SerdeVTable = Self;
40
41    fn id(_encoding: &Self::Encoding) -> EncodingId {
42        EncodingId::new_ref("vortex.varbin")
43    }
44
45    fn encoding(_array: &Self::Array) -> EncodingRef {
46        EncodingRef::new_ref(VarBinEncoding.as_ref())
47    }
48}
49
50#[derive(Clone, Debug)]
51pub struct VarBinArray {
52    dtype: DType,
53    bytes: ByteBuffer,
54    offsets: ArrayRef,
55    validity: Validity,
56    stats_set: ArrayStats,
57}
58
/// Unit type standing for the VarBin encoding; it carries no data of its own.
#[derive(Debug, Clone)]
pub struct VarBinEncoding;
61
62impl VarBinArray {
63    pub fn try_new(
64        offsets: ArrayRef,
65        bytes: ByteBuffer,
66        dtype: DType,
67        validity: Validity,
68    ) -> VortexResult<Self> {
69        if !offsets.dtype().is_int() || offsets.dtype().is_nullable() {
70            vortex_bail!(MismatchedTypes: "non nullable int", offsets.dtype());
71        }
72        if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
73            vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
74        }
75        if dtype.is_nullable() == (validity == Validity::NonNullable) {
76            vortex_bail!("incorrect validity {:?}", validity);
77        }
78
79        Ok(Self {
80            dtype,
81            bytes,
82            offsets,
83            validity,
84            stats_set: Default::default(),
85        })
86    }
87
88    #[inline]
89    pub fn offsets(&self) -> &ArrayRef {
90        &self.offsets
91    }
92
93    /// Access the value bytes child buffer
94    ///
95    /// # Note
96    ///
97    /// Bytes child buffer is never sliced when the array is sliced so this can include values
98    /// that are not logically present in the array. Users should prefer [sliced_bytes][Self::sliced_bytes]
99    /// unless they're resolving values via the offset child array.
100    #[inline]
101    pub fn bytes(&self) -> &ByteBuffer {
102        &self.bytes
103    }
104
105    /// Access value bytes child array limited to values that are logically present in
106    /// the array unlike [bytes][Self::bytes].
107    pub fn sliced_bytes(&self) -> ByteBuffer {
108        let first_offset: usize = self.offset_at(0).vortex_expect("1st offset");
109        let last_offset = self.offset_at(self.len()).vortex_expect("Last offset");
110
111        self.bytes().slice(first_offset..last_offset)
112    }
113
114    pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
115        let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
116        if size < u32::MAX as usize {
117            Self::from_vec_sized::<u32, T>(vec, dtype)
118        } else {
119            Self::from_vec_sized::<u64, T>(vec, dtype)
120        }
121    }
122
123    fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
124    where
125        O: NativePType + PrimInt,
126        T: AsRef<[u8]>,
127    {
128        let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
129        for v in vec {
130            builder.append_value(v.as_ref());
131        }
132        builder.finish(dtype)
133    }
134
135    #[allow(clippy::same_name_method)]
136    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
137        iter: I,
138        dtype: DType,
139    ) -> Self {
140        let iter = iter.into_iter();
141        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
142        for v in iter {
143            builder.append(v.as_ref().map(|o| o.as_ref()));
144        }
145        builder.finish(dtype)
146    }
147
148    pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
149        iter: I,
150        dtype: DType,
151    ) -> Self {
152        let iter = iter.into_iter();
153        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
154        for v in iter {
155            builder.append_value(v);
156        }
157        builder.finish(dtype)
158    }
159
160    /// Get value offset at a given index
161    ///
162    /// Note: There's 1 more offsets than the elements in the array, thus last offset is at array length index
163    pub fn offset_at(&self, index: usize) -> VortexResult<usize> {
164        if index > self.len() + 1 {
165            vortex_bail!(OutOfBounds: index, 0, self.len() + 1)
166        }
167
168        // TODO(ngates): PrimitiveArrayTrait should have get_scalar(idx) -> Option<T> method
169        Ok(self
170            .offsets()
171            .scalar_at(index)
172            .unwrap_or_else(|err| vortex_panic!(err, "Failed to get offset at index: {}", index))
173            .as_ref()
174            .try_into()
175            .vortex_expect("Failed to convert offset to usize"))
176    }
177
178    /// Access value bytes at a given index
179    ///
180    /// Will return buffer referncing underlying data without performing a copy
181    pub fn bytes_at(&self, index: usize) -> VortexResult<ByteBuffer> {
182        let start = self.offset_at(index)?;
183        let end = self.offset_at(index + 1)?;
184
185        Ok(self.bytes().slice(start..end))
186    }
187
188    /// Consumes self, returning a tuple containing the `DType`, the `bytes` array,
189    /// the `offsets` array, and the `validity`.
190    pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) {
191        (self.dtype, self.bytes, self.offsets, self.validity)
192    }
193}
194
195impl ValidityHelper for VarBinArray {
196    fn validity(&self) -> &Validity {
197        &self.validity
198    }
199}
200
201impl ArrayVTable<VarBinVTable> for VarBinVTable {
202    fn len(array: &VarBinArray) -> usize {
203        array.offsets().len().saturating_sub(1)
204    }
205
206    fn dtype(array: &VarBinArray) -> &DType {
207        &array.dtype
208    }
209
210    fn stats(array: &VarBinArray) -> StatsSetRef<'_> {
211        array.stats_set.to_ref(array.as_ref())
212    }
213}
214
215impl From<Vec<&[u8]>> for VarBinArray {
216    fn from(value: Vec<&[u8]>) -> Self {
217        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
218    }
219}
220
221impl From<Vec<Vec<u8>>> for VarBinArray {
222    fn from(value: Vec<Vec<u8>>) -> Self {
223        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
224    }
225}
226
227impl From<Vec<String>> for VarBinArray {
228    fn from(value: Vec<String>) -> Self {
229        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
230    }
231}
232
233impl From<Vec<&str>> for VarBinArray {
234    fn from(value: Vec<&str>) -> Self {
235        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
236    }
237}
238
239impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
240    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
241        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
242    }
243}
244
245impl FromIterator<Option<Vec<u8>>> for VarBinArray {
246    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
247        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
248    }
249}
250
251impl FromIterator<Option<String>> for VarBinArray {
252    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
253        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
254    }
255}
256
257impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
258    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
259        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
260    }
261}
262
263pub fn varbin_scalar(value: ByteBuffer, dtype: &DType) -> Scalar {
264    if matches!(dtype, DType::Utf8(_)) {
265        Scalar::try_utf8(value, dtype.nullability())
266            .map_err(|err| vortex_err!("Failed to create scalar from utf8 buffer: {}", err))
267            .vortex_unwrap()
268    } else {
269        Scalar::binary(value, dtype.nullability())
270    }
271}
272
#[cfg(test)]
mod test {
    use rstest::{fixture, rstest};
    use vortex_buffer::Buffer;
    use vortex_dtype::{DType, Nullability};

    use crate::arrays::primitive::PrimitiveArray;
    use crate::arrays::varbin::VarBinArray;
    use crate::validity::Validity;
    use crate::{Array, ArrayRef, IntoArray};

    /// First value stored in the fixture array.
    const SHORT: &str = "hello world";
    /// Second value stored in the fixture array.
    const LONG: &str = "hello world this is a long string";

    /// Two-element, non-nullable UTF-8 array holding `SHORT` followed by `LONG`.
    #[fixture]
    fn binary_array() -> ArrayRef {
        // Both values back to back; the offsets below split them at byte 11.
        let values = Buffer::copy_from("hello worldhello world this is a long string".as_bytes());
        let offsets = PrimitiveArray::from_iter([0, 11, 44]).into_array();
        let dtype = DType::Utf8(Nullability::NonNullable);

        let array = VarBinArray::try_new(offsets, values, dtype, Validity::NonNullable).unwrap();
        array.into_array()
    }

    /// Both values can be read back by index and the length matches the number of values.
    #[rstest]
    pub fn test_scalar_at(binary_array: ArrayRef) {
        assert_eq!(binary_array.len(), 2);

        let first = binary_array.scalar_at(0).unwrap();
        assert_eq!(first, SHORT.into());

        let second = binary_array.scalar_at(1).unwrap();
        assert_eq!(second, LONG.into());
    }

    /// Slicing off the first element leaves the second value at index 0.
    #[rstest]
    pub fn slice_array(binary_array: ArrayRef) {
        let tail = binary_array.slice(1, 2).unwrap();
        let only_value = tail.scalar_at(0).unwrap();
        assert_eq!(only_value, LONG.into());
    }
}