vortex_array/arrays/varbin/
mod.rs

1use std::fmt::Debug;
2
3pub use compute::compute_min_max;
4use num_traits::PrimInt;
5use vortex_buffer::ByteBuffer;
6use vortex_dtype::{DType, NativePType, Nullability};
7use vortex_error::{
8    VortexExpect as _, VortexResult, VortexUnwrap as _, vortex_bail, vortex_err, vortex_panic,
9};
10use vortex_mask::Mask;
11use vortex_scalar::Scalar;
12
13use crate::array::ArrayValidityImpl;
14use crate::arrays::varbin::builder::VarBinBuilder;
15use crate::arrays::varbin::serde::VarBinMetadata;
16use crate::compute::scalar_at;
17use crate::stats::{ArrayStats, StatsSetRef};
18use crate::validity::Validity;
19use crate::vtable::VTableRef;
20use crate::{
21    Array, ArrayImpl, ArrayRef, ArrayStatisticsImpl, Encoding, ProstMetadata, try_from_array_ref,
22};
23
24mod accessor;
25pub mod builder;
26mod canonical;
27mod compute;
28mod serde;
29mod variants;
30
31#[derive(Clone, Debug)]
32pub struct VarBinArray {
33    dtype: DType,
34    bytes: ByteBuffer,
35    offsets: ArrayRef,
36    validity: Validity,
37    stats_set: ArrayStats,
38}
39
40try_from_array_ref!(VarBinArray);
41
42#[derive(Debug)]
43pub struct VarBinEncoding;
44impl Encoding for VarBinEncoding {
45    type Array = VarBinArray;
46    type Metadata = ProstMetadata<VarBinMetadata>;
47}
48
49impl VarBinArray {
50    pub fn try_new(
51        offsets: ArrayRef,
52        bytes: ByteBuffer,
53        dtype: DType,
54        validity: Validity,
55    ) -> VortexResult<Self> {
56        if !offsets.dtype().is_int() || offsets.dtype().is_nullable() {
57            vortex_bail!(MismatchedTypes: "non nullable int", offsets.dtype());
58        }
59        if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
60            vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
61        }
62        if dtype.is_nullable() == (validity == Validity::NonNullable) {
63            vortex_bail!("incorrect validity {:?}", validity);
64        }
65
66        Ok(Self {
67            dtype,
68            bytes,
69            offsets,
70            validity,
71            stats_set: Default::default(),
72        })
73    }
74
75    #[inline]
76    pub fn offsets(&self) -> &ArrayRef {
77        &self.offsets
78    }
79
80    pub fn validity(&self) -> &Validity {
81        &self.validity
82    }
83
84    /// Access the value bytes child buffer
85    ///
86    /// # Note
87    ///
88    /// Bytes child buffer is never sliced when the array is sliced so this can include values
89    /// that are not logically present in the array. Users should prefer [sliced_bytes][Self::sliced_bytes]
90    /// unless they're resolving values via the offset child array.
91    #[inline]
92    pub fn bytes(&self) -> &ByteBuffer {
93        &self.bytes
94    }
95
96    /// Access value bytes child array limited to values that are logically present in
97    /// the array unlike [bytes][Self::bytes].
98    pub fn sliced_bytes(&self) -> ByteBuffer {
99        let first_offset: usize = self.offset_at(0).vortex_expect("1st offset");
100        let last_offset = self.offset_at(self.len()).vortex_expect("Last offset");
101
102        self.bytes().slice(first_offset..last_offset)
103    }
104
105    pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
106        let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
107        if size < u32::MAX as usize {
108            Self::from_vec_sized::<u32, T>(vec, dtype)
109        } else {
110            Self::from_vec_sized::<u64, T>(vec, dtype)
111        }
112    }
113
114    fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
115    where
116        O: NativePType + PrimInt,
117        T: AsRef<[u8]>,
118    {
119        let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
120        for v in vec {
121            builder.append_value(v.as_ref());
122        }
123        builder.finish(dtype)
124    }
125
126    #[allow(clippy::same_name_method)]
127    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
128        iter: I,
129        dtype: DType,
130    ) -> Self {
131        let iter = iter.into_iter();
132        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
133        for v in iter {
134            builder.append(v.as_ref().map(|o| o.as_ref()));
135        }
136        builder.finish(dtype)
137    }
138
139    pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
140        iter: I,
141        dtype: DType,
142    ) -> Self {
143        let iter = iter.into_iter();
144        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
145        for v in iter {
146            builder.append_value(v);
147        }
148        builder.finish(dtype)
149    }
150
151    /// Get value offset at a given index
152    ///
153    /// Note: There's 1 more offsets than the elements in the array, thus last offset is at array length index
154    pub fn offset_at(&self, index: usize) -> VortexResult<usize> {
155        if index > self.len() + 1 {
156            vortex_bail!(OutOfBounds: index, 0, self.len() + 1)
157        }
158
159        // TODO(ngates): PrimitiveArrayTrait should have get_scalar(idx) -> Option<T> method
160        Ok(scalar_at(self.offsets(), index)
161            .unwrap_or_else(|err| vortex_panic!(err, "Failed to get offset at index: {}", index))
162            .as_ref()
163            .try_into()
164            .vortex_expect("Failed to convert offset to usize"))
165    }
166
167    /// Access value bytes at a given index
168    ///
169    /// Will return buffer referncing underlying data without performing a copy
170    pub fn bytes_at(&self, index: usize) -> VortexResult<ByteBuffer> {
171        let start = self.offset_at(index)?;
172        let end = self.offset_at(index + 1)?;
173
174        Ok(self.bytes().slice(start..end))
175    }
176
177    /// Consumes self, returning a tuple containing the `DType`, the `bytes` array,
178    /// the `offsets` array, and the `validity`.
179    pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) {
180        (self.dtype, self.bytes, self.offsets, self.validity)
181    }
182}
183
184impl ArrayValidityImpl for VarBinArray {
185    fn _is_valid(&self, index: usize) -> VortexResult<bool> {
186        self.validity.is_valid(index)
187    }
188
189    fn _all_valid(&self) -> VortexResult<bool> {
190        self.validity.all_valid()
191    }
192
193    fn _all_invalid(&self) -> VortexResult<bool> {
194        self.validity.all_invalid()
195    }
196
197    fn _validity_mask(&self) -> VortexResult<Mask> {
198        self.validity.to_mask(self.len())
199    }
200}
201
202impl ArrayImpl for VarBinArray {
203    type Encoding = VarBinEncoding;
204
205    fn _len(&self) -> usize {
206        self.offsets().len().saturating_sub(1)
207    }
208
209    fn _dtype(&self) -> &DType {
210        &self.dtype
211    }
212
213    fn _vtable(&self) -> VTableRef {
214        VTableRef::new_ref(&VarBinEncoding)
215    }
216
217    fn _with_children(&self, children: &[ArrayRef]) -> VortexResult<Self> {
218        let new = match children.len() {
219            // Only the offsets array is mandatory
220            1 => {
221                let offsets = children[0].clone();
222                Self::try_new(
223                    offsets,
224                    self.bytes().clone(),
225                    self.dtype().clone(),
226                    self.validity().clone(),
227                )?
228            }
229            // If are provided with both an offsets and validity arrays
230            2 => {
231                let offsets = children[0].clone();
232                let validity_array = children[1].clone();
233                Self::try_new(
234                    offsets,
235                    self.bytes().clone(),
236                    self.dtype().clone(),
237                    Validity::Array(validity_array),
238                )?
239            }
240            _ => vortex_bail!("unexpected number of new children"),
241        };
242
243        Ok(new)
244    }
245}
246
247impl ArrayStatisticsImpl for VarBinArray {
248    fn _stats_ref(&self) -> StatsSetRef<'_> {
249        self.stats_set.to_ref(self)
250    }
251}
252
253impl From<Vec<&[u8]>> for VarBinArray {
254    fn from(value: Vec<&[u8]>) -> Self {
255        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
256    }
257}
258
259impl From<Vec<Vec<u8>>> for VarBinArray {
260    fn from(value: Vec<Vec<u8>>) -> Self {
261        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
262    }
263}
264
265impl From<Vec<String>> for VarBinArray {
266    fn from(value: Vec<String>) -> Self {
267        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
268    }
269}
270
271impl From<Vec<&str>> for VarBinArray {
272    fn from(value: Vec<&str>) -> Self {
273        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
274    }
275}
276
277impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
278    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
279        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
280    }
281}
282
283impl FromIterator<Option<Vec<u8>>> for VarBinArray {
284    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
285        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
286    }
287}
288
289impl FromIterator<Option<String>> for VarBinArray {
290    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
291        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
292    }
293}
294
295impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
296    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
297        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
298    }
299}
300
301pub fn varbin_scalar(value: ByteBuffer, dtype: &DType) -> Scalar {
302    if matches!(dtype, DType::Utf8(_)) {
303        Scalar::try_utf8(value, dtype.nullability())
304            .map_err(|err| vortex_err!("Failed to create scalar from utf8 buffer: {}", err))
305            .vortex_unwrap()
306    } else {
307        Scalar::binary(value, dtype.nullability())
308    }
309}
310
#[cfg(test)]
mod test {
    use rstest::{fixture, rstest};
    use vortex_buffer::Buffer;
    use vortex_dtype::{DType, Nullability};

    use crate::ArrayRef;
    use crate::array::Array;
    use crate::arrays::primitive::PrimitiveArray;
    use crate::arrays::varbin::VarBinArray;
    use crate::compute::{scalar_at, slice};
    use crate::validity::Validity;

    /// Two-element non-nullable UTF-8 array: "hello world" followed by a longer string.
    #[fixture]
    fn binary_array() -> ArrayRef {
        let offsets = PrimitiveArray::from_iter([0, 11, 44]).into_array();
        let values = Buffer::copy_from("hello worldhello world this is a long string".as_bytes());

        let array = VarBinArray::try_new(
            offsets,
            values,
            DType::Utf8(Nullability::NonNullable),
            Validity::NonNullable,
        )
        .unwrap();

        array.into_array()
    }

    /// Both elements can be read back as scalars.
    #[rstest]
    pub fn test_scalar_at(binary_array: ArrayRef) {
        assert_eq!(binary_array.len(), 2);

        let first = scalar_at(&binary_array, 0).unwrap();
        assert_eq!(first, "hello world".into());

        let second = scalar_at(&binary_array, 1).unwrap();
        assert_eq!(second, "hello world this is a long string".into())
    }

    /// Slicing off the first element leaves the second one at index 0.
    #[rstest]
    pub fn slice_array(binary_array: ArrayRef) {
        let binary_arr = slice(&binary_array, 1, 2).unwrap();
        let only = scalar_at(&binary_arr, 0).unwrap();
        assert_eq!(only, "hello world this is a long string".into());
    }
}
357}