vortex_array/arrays/varbin/
mod.rs1use std::fmt::Debug;
5
6pub(crate) use compute::compute_min_max;
7use num_traits::PrimInt;
8use vortex_buffer::ByteBuffer;
9use vortex_dtype::{DType, NativePType, Nullability};
10use vortex_error::{VortexExpect as _, VortexResult, VortexUnwrap as _, vortex_bail, vortex_err};
11use vortex_scalar::Scalar;
12
13use crate::arrays::varbin::builder::VarBinBuilder;
14use crate::stats::{ArrayStats, StatsSetRef};
15use crate::validity::Validity;
16use crate::vtable::{
17 ArrayVTable, NotSupported, VTable, ValidityHelper, ValidityVTableFromValidityHelper,
18};
19use crate::{Array, ArrayRef, EncodingId, EncodingRef, vtable};
20
21mod accessor;
22pub mod builder;
23mod canonical;
24mod compute;
25mod ops;
26mod serde;
27
28vtable!(VarBin);
29
30impl VTable for VarBinVTable {
31 type Array = VarBinArray;
32 type Encoding = VarBinEncoding;
33 type ArrayVTable = Self;
34 type CanonicalVTable = Self;
35 type OperationsVTable = Self;
36 type ValidityVTable = ValidityVTableFromValidityHelper;
37 type VisitorVTable = Self;
38 type ComputeVTable = NotSupported;
39 type EncodeVTable = NotSupported;
40 type PipelineVTable = NotSupported;
41 type SerdeVTable = Self;
42
43 fn id(_encoding: &Self::Encoding) -> EncodingId {
44 EncodingId::new_ref("vortex.varbin")
45 }
46
47 fn encoding(_array: &Self::Array) -> EncodingRef {
48 EncodingRef::new_ref(VarBinEncoding.as_ref())
49 }
50}
51
52#[derive(Clone, Debug)]
53pub struct VarBinArray {
54 dtype: DType,
55 bytes: ByteBuffer,
56 offsets: ArrayRef,
57 validity: Validity,
58 stats_set: ArrayStats,
59}
60
/// Marker type for the VarBin encoding, identified as `"vortex.varbin"`.
#[derive(Debug, Clone)]
pub struct VarBinEncoding;
63
64impl VarBinArray {
65 pub fn new(offsets: ArrayRef, bytes: ByteBuffer, dtype: DType, validity: Validity) -> Self {
66 Self::try_new(offsets, bytes, dtype, validity).vortex_expect("VarBinArray new")
67 }
68
69 pub fn try_new(
70 offsets: ArrayRef,
71 bytes: ByteBuffer,
72 dtype: DType,
73 validity: Validity,
74 ) -> VortexResult<Self> {
75 if !offsets.dtype().is_int() || offsets.dtype().is_nullable() {
76 vortex_bail!(MismatchedTypes: "non nullable int", offsets.dtype());
77 }
78 if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
79 vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
80 }
81 if dtype.is_nullable() == (validity == Validity::NonNullable) {
82 vortex_bail!("incorrect validity {:?}", validity);
83 }
84
85 Ok(Self {
86 dtype,
87 bytes,
88 offsets,
89 validity,
90 stats_set: Default::default(),
91 })
92 }
93
94 #[inline]
95 pub fn offsets(&self) -> &ArrayRef {
96 &self.offsets
97 }
98
99 #[inline]
107 pub fn bytes(&self) -> &ByteBuffer {
108 &self.bytes
109 }
110
111 pub fn sliced_bytes(&self) -> ByteBuffer {
114 let first_offset: usize = self.offset_at(0);
115 let last_offset = self.offset_at(self.len());
116
117 self.bytes().slice(first_offset..last_offset)
118 }
119
120 pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
121 let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
122 if size < u32::MAX as usize {
123 Self::from_vec_sized::<u32, T>(vec, dtype)
124 } else {
125 Self::from_vec_sized::<u64, T>(vec, dtype)
126 }
127 }
128
129 fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
130 where
131 O: NativePType + PrimInt,
132 T: AsRef<[u8]>,
133 {
134 let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
135 for v in vec {
136 builder.append_value(v.as_ref());
137 }
138 builder.finish(dtype)
139 }
140
141 #[allow(clippy::same_name_method)]
142 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
143 iter: I,
144 dtype: DType,
145 ) -> Self {
146 let iter = iter.into_iter();
147 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
148 for v in iter {
149 builder.append(v.as_ref().map(|o| o.as_ref()));
150 }
151 builder.finish(dtype)
152 }
153
154 pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
155 iter: I,
156 dtype: DType,
157 ) -> Self {
158 let iter = iter.into_iter();
159 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
160 for v in iter {
161 builder.append_value(v);
162 }
163 builder.finish(dtype)
164 }
165
166 pub fn offset_at(&self, index: usize) -> usize {
172 assert!(
173 index <= self.len(),
174 "Index {index} out of bounds 0..={}",
175 self.len()
176 );
177
178 self.offsets()
179 .scalar_at(index)
180 .as_ref()
181 .try_into()
182 .vortex_expect("Failed to convert offset to usize")
183 }
184
185 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
189 let start = self.offset_at(index);
190 let end = self.offset_at(index + 1);
191
192 self.bytes().slice(start..end)
193 }
194
195 pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) {
198 (self.dtype, self.bytes, self.offsets, self.validity)
199 }
200}
201
202impl ValidityHelper for VarBinArray {
203 fn validity(&self) -> &Validity {
204 &self.validity
205 }
206}
207
208impl ArrayVTable<VarBinVTable> for VarBinVTable {
209 fn len(array: &VarBinArray) -> usize {
210 array.offsets().len().saturating_sub(1)
211 }
212
213 fn dtype(array: &VarBinArray) -> &DType {
214 &array.dtype
215 }
216
217 fn stats(array: &VarBinArray) -> StatsSetRef<'_> {
218 array.stats_set.to_ref(array.as_ref())
219 }
220}
221
222impl From<Vec<&[u8]>> for VarBinArray {
223 fn from(value: Vec<&[u8]>) -> Self {
224 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
225 }
226}
227
228impl From<Vec<Vec<u8>>> for VarBinArray {
229 fn from(value: Vec<Vec<u8>>) -> Self {
230 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
231 }
232}
233
234impl From<Vec<String>> for VarBinArray {
235 fn from(value: Vec<String>) -> Self {
236 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
237 }
238}
239
240impl From<Vec<&str>> for VarBinArray {
241 fn from(value: Vec<&str>) -> Self {
242 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
243 }
244}
245
246impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
247 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
248 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
249 }
250}
251
252impl FromIterator<Option<Vec<u8>>> for VarBinArray {
253 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
254 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
255 }
256}
257
258impl FromIterator<Option<String>> for VarBinArray {
259 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
260 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
261 }
262}
263
264impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
265 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
266 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
267 }
268}
269
270pub fn varbin_scalar(value: ByteBuffer, dtype: &DType) -> Scalar {
271 if matches!(dtype, DType::Utf8(_)) {
272 Scalar::try_utf8(value, dtype.nullability())
273 .map_err(|err| vortex_err!("Failed to create scalar from utf8 buffer: {}", err))
274 .vortex_unwrap()
275 } else {
276 Scalar::binary(value, dtype.nullability())
277 }
278}
279
#[cfg(test)]
mod test {
    use rstest::{fixture, rstest};
    use vortex_buffer::Buffer;
    use vortex_dtype::{DType, Nullability};

    use crate::arrays::primitive::PrimitiveArray;
    use crate::arrays::varbin::VarBinArray;
    use crate::validity::Validity;
    use crate::{Array, ArrayRef, IntoArray};

    /// Non-nullable UTF-8 array with two values: an 11-byte string followed by a 33-byte string.
    #[fixture]
    fn binary_array() -> ArrayRef {
        let offsets = PrimitiveArray::from_iter([0, 11, 44]).into_array();
        let bytes = Buffer::copy_from("hello worldhello world this is a long string".as_bytes());
        let dtype = DType::Utf8(Nullability::NonNullable);

        let array = VarBinArray::try_new(offsets, bytes, dtype, Validity::NonNullable).unwrap();
        array.into_array()
    }

    /// Each element is readable as a scalar at its own index.
    #[rstest]
    pub fn test_scalar_at(binary_array: ArrayRef) {
        assert_eq!(binary_array.len(), 2);

        let first = binary_array.scalar_at(0);
        assert_eq!(first, "hello world".into());

        let second = binary_array.scalar_at(1);
        assert_eq!(second, "hello world this is a long string".into());
    }

    /// Slicing off the first element leaves the second one at index 0.
    #[rstest]
    pub fn slice_array(binary_array: ArrayRef) {
        let tail = binary_array.slice(1, 2);
        let only_value = tail.scalar_at(0);
        assert_eq!(only_value, "hello world this is a long string".into());
    }
}