vortex_array/arrays/varbin/
mod.rs1use std::fmt::Debug;
5
6pub use compute::compute_min_max;
7use num_traits::PrimInt;
8use vortex_buffer::ByteBuffer;
9use vortex_dtype::{DType, NativePType, Nullability};
10use vortex_error::{VortexExpect as _, VortexResult, VortexUnwrap as _, vortex_bail, vortex_err};
11use vortex_scalar::Scalar;
12
13use crate::arrays::varbin::builder::VarBinBuilder;
14use crate::stats::{ArrayStats, StatsSetRef};
15use crate::validity::Validity;
16use crate::vtable::{
17 ArrayVTable, NotSupported, VTable, ValidityHelper, ValidityVTableFromValidityHelper,
18};
19use crate::{Array, ArrayRef, EncodingId, EncodingRef, vtable};
20
21mod accessor;
22pub mod builder;
23mod canonical;
24mod compute;
25mod ops;
26mod serde;
27
28vtable!(VarBin);
29
30impl VTable for VarBinVTable {
31 type Array = VarBinArray;
32 type Encoding = VarBinEncoding;
33 type ArrayVTable = Self;
34 type CanonicalVTable = Self;
35 type OperationsVTable = Self;
36 type ValidityVTable = ValidityVTableFromValidityHelper;
37 type VisitorVTable = Self;
38 type ComputeVTable = NotSupported;
39 type EncodeVTable = NotSupported;
40 type SerdeVTable = Self;
41
42 fn id(_encoding: &Self::Encoding) -> EncodingId {
43 EncodingId::new_ref("vortex.varbin")
44 }
45
46 fn encoding(_array: &Self::Array) -> EncodingRef {
47 EncodingRef::new_ref(VarBinEncoding.as_ref())
48 }
49}
50
51#[derive(Clone, Debug)]
52pub struct VarBinArray {
53 dtype: DType,
54 bytes: ByteBuffer,
55 offsets: ArrayRef,
56 validity: Validity,
57 stats_set: ArrayStats,
58}
59
/// Unit marker type for the VarBin encoding, identified as `"vortex.varbin"`
/// by the `VTable` implementation in this module.
#[derive(Clone, Debug)]
pub struct VarBinEncoding;
62
63impl VarBinArray {
64 pub fn new(offsets: ArrayRef, bytes: ByteBuffer, dtype: DType, validity: Validity) -> Self {
65 Self::try_new(offsets, bytes, dtype, validity).vortex_expect("VarBinArray new")
66 }
67
68 pub fn try_new(
69 offsets: ArrayRef,
70 bytes: ByteBuffer,
71 dtype: DType,
72 validity: Validity,
73 ) -> VortexResult<Self> {
74 if !offsets.dtype().is_int() || offsets.dtype().is_nullable() {
75 vortex_bail!(MismatchedTypes: "non nullable int", offsets.dtype());
76 }
77 if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
78 vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
79 }
80 if dtype.is_nullable() == (validity == Validity::NonNullable) {
81 vortex_bail!("incorrect validity {:?}", validity);
82 }
83
84 Ok(Self {
85 dtype,
86 bytes,
87 offsets,
88 validity,
89 stats_set: Default::default(),
90 })
91 }
92
93 #[inline]
94 pub fn offsets(&self) -> &ArrayRef {
95 &self.offsets
96 }
97
98 #[inline]
106 pub fn bytes(&self) -> &ByteBuffer {
107 &self.bytes
108 }
109
110 pub fn sliced_bytes(&self) -> ByteBuffer {
113 let first_offset: usize = self.offset_at(0);
114 let last_offset = self.offset_at(self.len());
115
116 self.bytes().slice(first_offset..last_offset)
117 }
118
119 pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
120 let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
121 if size < u32::MAX as usize {
122 Self::from_vec_sized::<u32, T>(vec, dtype)
123 } else {
124 Self::from_vec_sized::<u64, T>(vec, dtype)
125 }
126 }
127
128 fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
129 where
130 O: NativePType + PrimInt,
131 T: AsRef<[u8]>,
132 {
133 let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
134 for v in vec {
135 builder.append_value(v.as_ref());
136 }
137 builder.finish(dtype)
138 }
139
140 #[allow(clippy::same_name_method)]
141 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
142 iter: I,
143 dtype: DType,
144 ) -> Self {
145 let iter = iter.into_iter();
146 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
147 for v in iter {
148 builder.append(v.as_ref().map(|o| o.as_ref()));
149 }
150 builder.finish(dtype)
151 }
152
153 pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
154 iter: I,
155 dtype: DType,
156 ) -> Self {
157 let iter = iter.into_iter();
158 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
159 for v in iter {
160 builder.append_value(v);
161 }
162 builder.finish(dtype)
163 }
164
165 pub fn offset_at(&self, index: usize) -> usize {
171 assert!(
172 index <= self.len(),
173 "Index {index} out of bounds 0..={}",
174 self.len()
175 );
176
177 self.offsets()
178 .scalar_at(index)
179 .as_ref()
180 .try_into()
181 .vortex_expect("Failed to convert offset to usize")
182 }
183
184 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
188 let start = self.offset_at(index);
189 let end = self.offset_at(index + 1);
190
191 self.bytes().slice(start..end)
192 }
193
194 pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) {
197 (self.dtype, self.bytes, self.offsets, self.validity)
198 }
199}
200
201impl ValidityHelper for VarBinArray {
202 fn validity(&self) -> &Validity {
203 &self.validity
204 }
205}
206
207impl ArrayVTable<VarBinVTable> for VarBinVTable {
208 fn len(array: &VarBinArray) -> usize {
209 array.offsets().len().saturating_sub(1)
210 }
211
212 fn dtype(array: &VarBinArray) -> &DType {
213 &array.dtype
214 }
215
216 fn stats(array: &VarBinArray) -> StatsSetRef<'_> {
217 array.stats_set.to_ref(array.as_ref())
218 }
219}
220
221impl From<Vec<&[u8]>> for VarBinArray {
222 fn from(value: Vec<&[u8]>) -> Self {
223 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
224 }
225}
226
227impl From<Vec<Vec<u8>>> for VarBinArray {
228 fn from(value: Vec<Vec<u8>>) -> Self {
229 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
230 }
231}
232
233impl From<Vec<String>> for VarBinArray {
234 fn from(value: Vec<String>) -> Self {
235 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
236 }
237}
238
239impl From<Vec<&str>> for VarBinArray {
240 fn from(value: Vec<&str>) -> Self {
241 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
242 }
243}
244
245impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
246 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
247 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
248 }
249}
250
251impl FromIterator<Option<Vec<u8>>> for VarBinArray {
252 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
253 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
254 }
255}
256
257impl FromIterator<Option<String>> for VarBinArray {
258 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
259 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
260 }
261}
262
263impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
264 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
265 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
266 }
267}
268
269pub fn varbin_scalar(value: ByteBuffer, dtype: &DType) -> Scalar {
270 if matches!(dtype, DType::Utf8(_)) {
271 Scalar::try_utf8(value, dtype.nullability())
272 .map_err(|err| vortex_err!("Failed to create scalar from utf8 buffer: {}", err))
273 .vortex_unwrap()
274 } else {
275 Scalar::binary(value, dtype.nullability())
276 }
277}
278
#[cfg(test)]
mod test {
    use rstest::{fixture, rstest};
    use vortex_buffer::Buffer;
    use vortex_dtype::{DType, Nullability};

    use crate::arrays::primitive::PrimitiveArray;
    use crate::arrays::varbin::VarBinArray;
    use crate::validity::Validity;
    use crate::{Array, ArrayRef, IntoArray};

    /// Two-element, non-nullable UTF-8 array:
    /// `"hello world"` (bytes 0..11) and
    /// `"hello world this is a long string"` (bytes 11..44).
    #[fixture]
    fn binary_array() -> ArrayRef {
        let data = Buffer::copy_from("hello worldhello world this is a long string".as_bytes());
        let boundaries = PrimitiveArray::from_iter([0, 11, 44]);

        let varbin = VarBinArray::try_new(
            boundaries.into_array(),
            data,
            DType::Utf8(Nullability::NonNullable),
            Validity::NonNullable,
        )
        .unwrap();

        varbin.into_array()
    }

    /// Each element is readable as a scalar and the length is offsets - 1.
    #[rstest]
    pub fn test_scalar_at(binary_array: ArrayRef) {
        assert_eq!(binary_array.len(), 2);

        let first = binary_array.scalar_at(0);
        assert_eq!(first, "hello world".into());

        let second = binary_array.scalar_at(1);
        assert_eq!(second, "hello world this is a long string".into());
    }

    /// Slicing off the first element leaves the second at index 0.
    #[rstest]
    pub fn slice_array(binary_array: ArrayRef) {
        let tail = binary_array.slice(1, 2);
        let only = tail.scalar_at(0);
        assert_eq!(only, "hello world this is a long string".into());
    }
}