vortex_array/arrays/varbin/
mod.rs

1use std::fmt::Debug;
2
3pub use compute::compute_min_max;
4use num_traits::PrimInt;
5pub use stats::compute_varbin_statistics;
6use vortex_buffer::ByteBuffer;
7use vortex_dtype::{DType, NativePType, Nullability};
8use vortex_error::{
9    VortexExpect as _, VortexResult, VortexUnwrap as _, vortex_bail, vortex_err, vortex_panic,
10};
11use vortex_mask::Mask;
12use vortex_scalar::Scalar;
13
14use crate::array::ArrayValidityImpl;
15use crate::arrays::varbin::builder::VarBinBuilder;
16use crate::arrays::varbin::serde::VarBinMetadata;
17use crate::compute::scalar_at;
18use crate::stats::{ArrayStats, StatsSetRef};
19use crate::validity::Validity;
20use crate::vtable::{EncodingVTable, VTableRef};
21use crate::{
22    Array, ArrayImpl, ArrayRef, ArrayStatisticsImpl, Encoding, EncodingId, RkyvMetadata,
23    try_from_array_ref,
24};
25
26mod accessor;
27pub mod builder;
28mod canonical;
29mod compute;
30mod serde;
31mod stats;
32mod variants;
33
34#[derive(Clone, Debug)]
35pub struct VarBinArray {
36    dtype: DType,
37    bytes: ByteBuffer,
38    offsets: ArrayRef,
39    validity: Validity,
40    stats_set: ArrayStats,
41}
42
43try_from_array_ref!(VarBinArray);
44
45pub struct VarBinEncoding;
46impl Encoding for VarBinEncoding {
47    type Array = VarBinArray;
48    type Metadata = RkyvMetadata<VarBinMetadata>;
49}
50
51impl EncodingVTable for VarBinEncoding {
52    fn id(&self) -> EncodingId {
53        EncodingId::new_ref("vortex.varbin")
54    }
55}
56
57impl VarBinArray {
58    pub fn try_new(
59        offsets: ArrayRef,
60        bytes: ByteBuffer,
61        dtype: DType,
62        validity: Validity,
63    ) -> VortexResult<Self> {
64        if !offsets.dtype().is_int() || offsets.dtype().is_nullable() {
65            vortex_bail!(MismatchedTypes: "non nullable int", offsets.dtype());
66        }
67        if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
68            vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
69        }
70        if dtype.is_nullable() == (validity == Validity::NonNullable) {
71            vortex_bail!("incorrect validity {:?}", validity);
72        }
73
74        Ok(Self {
75            dtype,
76            bytes,
77            offsets,
78            validity,
79            stats_set: Default::default(),
80        })
81    }
82
83    #[inline]
84    pub fn offsets(&self) -> &ArrayRef {
85        &self.offsets
86    }
87
88    pub fn validity(&self) -> &Validity {
89        &self.validity
90    }
91
92    /// Access the value bytes child buffer
93    ///
94    /// # Note
95    ///
96    /// Bytes child buffer is never sliced when the array is sliced so this can include values
97    /// that are not logically present in the array. Users should prefer [sliced_bytes][Self::sliced_bytes]
98    /// unless they're resolving values via the offset child array.
99    #[inline]
100    pub fn bytes(&self) -> &ByteBuffer {
101        &self.bytes
102    }
103
104    /// Access value bytes child array limited to values that are logically present in
105    /// the array unlike [bytes][Self::bytes].
106    pub fn sliced_bytes(&self) -> ByteBuffer {
107        let first_offset: usize = self.offset_at(0).vortex_expect("1st offset");
108        let last_offset = self.offset_at(self.len()).vortex_expect("Last offset");
109
110        self.bytes().slice(first_offset..last_offset)
111    }
112
113    pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
114        let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
115        if size < u32::MAX as usize {
116            Self::from_vec_sized::<u32, T>(vec, dtype)
117        } else {
118            Self::from_vec_sized::<u64, T>(vec, dtype)
119        }
120    }
121
122    fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
123    where
124        O: NativePType + PrimInt,
125        T: AsRef<[u8]>,
126    {
127        let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
128        for v in vec {
129            builder.append_value(v.as_ref());
130        }
131        builder.finish(dtype)
132    }
133
134    #[allow(clippy::same_name_method)]
135    pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
136        iter: I,
137        dtype: DType,
138    ) -> Self {
139        let iter = iter.into_iter();
140        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
141        for v in iter {
142            builder.append(v.as_ref().map(|o| o.as_ref()));
143        }
144        builder.finish(dtype)
145    }
146
147    pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
148        iter: I,
149        dtype: DType,
150    ) -> Self {
151        let iter = iter.into_iter();
152        let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
153        for v in iter {
154            builder.append_value(v);
155        }
156        builder.finish(dtype)
157    }
158
159    /// Get value offset at a given index
160    ///
161    /// Note: There's 1 more offsets than the elements in the array, thus last offset is at array length index
162    pub fn offset_at(&self, index: usize) -> VortexResult<usize> {
163        if index > self.len() + 1 {
164            vortex_bail!(OutOfBounds: index, 0, self.len() + 1)
165        }
166
167        // TODO(ngates): PrimitiveArrayTrait should have get_scalar(idx) -> Option<T> method
168        Ok(scalar_at(self.offsets(), index)
169            .unwrap_or_else(|err| vortex_panic!(err, "Failed to get offset at index: {}", index))
170            .as_ref()
171            .try_into()
172            .vortex_expect("Failed to convert offset to usize"))
173    }
174
175    /// Access value bytes at a given index
176    ///
177    /// Will return buffer referncing underlying data without performing a copy
178    pub fn bytes_at(&self, index: usize) -> VortexResult<ByteBuffer> {
179        let start = self.offset_at(index)?;
180        let end = self.offset_at(index + 1)?;
181
182        Ok(self.bytes().slice(start..end))
183    }
184
185    /// Consumes self, returning a tuple containing the `DType`, the `bytes` array,
186    /// the `offsets` array, and the `validity`.
187    pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) {
188        (self.dtype, self.bytes, self.offsets, self.validity)
189    }
190}
191
192impl ArrayValidityImpl for VarBinArray {
193    fn _is_valid(&self, index: usize) -> VortexResult<bool> {
194        self.validity.is_valid(index)
195    }
196
197    fn _all_valid(&self) -> VortexResult<bool> {
198        self.validity.all_valid()
199    }
200
201    fn _all_invalid(&self) -> VortexResult<bool> {
202        self.validity.all_invalid()
203    }
204
205    fn _validity_mask(&self) -> VortexResult<Mask> {
206        self.validity.to_logical(self.len())
207    }
208}
209
210impl ArrayImpl for VarBinArray {
211    type Encoding = VarBinEncoding;
212
213    fn _len(&self) -> usize {
214        self.offsets().len().saturating_sub(1)
215    }
216
217    fn _dtype(&self) -> &DType {
218        &self.dtype
219    }
220
221    fn _vtable(&self) -> VTableRef {
222        VTableRef::new_ref(&VarBinEncoding)
223    }
224}
225
226impl ArrayStatisticsImpl for VarBinArray {
227    fn _stats_ref(&self) -> StatsSetRef<'_> {
228        self.stats_set.to_ref(self)
229    }
230}
231
232impl From<Vec<&[u8]>> for VarBinArray {
233    fn from(value: Vec<&[u8]>) -> Self {
234        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
235    }
236}
237
238impl From<Vec<Vec<u8>>> for VarBinArray {
239    fn from(value: Vec<Vec<u8>>) -> Self {
240        Self::from_vec(value, DType::Binary(Nullability::NonNullable))
241    }
242}
243
244impl From<Vec<String>> for VarBinArray {
245    fn from(value: Vec<String>) -> Self {
246        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
247    }
248}
249
250impl From<Vec<&str>> for VarBinArray {
251    fn from(value: Vec<&str>) -> Self {
252        Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
253    }
254}
255
256impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
257    fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
258        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
259    }
260}
261
262impl FromIterator<Option<Vec<u8>>> for VarBinArray {
263    fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
264        Self::from_iter(iter, DType::Binary(Nullability::Nullable))
265    }
266}
267
268impl FromIterator<Option<String>> for VarBinArray {
269    fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
270        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
271    }
272}
273
274impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
275    fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
276        Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
277    }
278}
279
280pub fn varbin_scalar(value: ByteBuffer, dtype: &DType) -> Scalar {
281    if matches!(dtype, DType::Utf8(_)) {
282        Scalar::try_utf8(value, dtype.nullability())
283            .map_err(|err| vortex_err!("Failed to create scalar from utf8 buffer: {}", err))
284            .vortex_unwrap()
285    } else {
286        Scalar::binary(value, dtype.nullability())
287    }
288}
289
#[cfg(test)]
mod test {
    use rstest::{fixture, rstest};
    use vortex_buffer::Buffer;
    use vortex_dtype::{DType, Nullability};

    use crate::ArrayRef;
    use crate::array::Array;
    use crate::arrays::primitive::PrimitiveArray;
    use crate::arrays::varbin::VarBinArray;
    use crate::compute::{scalar_at, slice};
    use crate::validity::Validity;

    /// Two-element non-nullable UTF-8 array: "hello world" followed by
    /// "hello world this is a long string".
    #[fixture]
    fn binary_array() -> ArrayRef {
        let offsets = PrimitiveArray::from_iter([0, 11, 44]).into_array();
        let values = Buffer::copy_from("hello worldhello world this is a long string".as_bytes());
        let dtype = DType::Utf8(Nullability::NonNullable);

        let array = VarBinArray::try_new(offsets, values, dtype, Validity::NonNullable).unwrap();
        array.into_array()
    }

    /// Every element of the fixture is readable through `scalar_at`.
    #[rstest]
    pub fn test_scalar_at(binary_array: ArrayRef) {
        assert_eq!(binary_array.len(), 2);

        let expected = ["hello world", "hello world this is a long string"];
        for (idx, text) in expected.into_iter().enumerate() {
            assert_eq!(scalar_at(&binary_array, idx).unwrap(), text.into());
        }
    }

    /// Slicing off the first element leaves the second one at index 0.
    #[rstest]
    pub fn slice_array(binary_array: ArrayRef) {
        let tail = slice(&binary_array, 1, 2).unwrap();
        let only_value = scalar_at(&tail, 0).unwrap();
        assert_eq!(only_value, "hello world this is a long string".into());
    }
}
334        );
335    }
336}