vortex_array/arrays/varbin/
mod.rs1use std::fmt::Debug;
2
3pub use compute::compute_min_max;
4use num_traits::PrimInt;
5use vortex_buffer::ByteBuffer;
6use vortex_dtype::{DType, NativePType, Nullability};
7use vortex_error::{
8 VortexExpect as _, VortexResult, VortexUnwrap as _, vortex_bail, vortex_err, vortex_panic,
9};
10use vortex_mask::Mask;
11use vortex_scalar::Scalar;
12
13use crate::array::ArrayValidityImpl;
14use crate::arrays::varbin::builder::VarBinBuilder;
15use crate::arrays::varbin::serde::VarBinMetadata;
16use crate::compute::scalar_at;
17use crate::stats::{ArrayStats, StatsSetRef};
18use crate::validity::Validity;
19use crate::vtable::VTableRef;
20use crate::{
21 Array, ArrayImpl, ArrayRef, ArrayStatisticsImpl, Encoding, ProstMetadata, try_from_array_ref,
22};
23
24mod accessor;
25pub mod builder;
26mod canonical;
27mod compute;
28mod serde;
29mod variants;
30
31#[derive(Clone, Debug)]
32pub struct VarBinArray {
33 dtype: DType,
34 bytes: ByteBuffer,
35 offsets: ArrayRef,
36 validity: Validity,
37 stats_set: ArrayStats,
38}
39
40try_from_array_ref!(VarBinArray);
41
42#[derive(Debug)]
43pub struct VarBinEncoding;
44impl Encoding for VarBinEncoding {
45 type Array = VarBinArray;
46 type Metadata = ProstMetadata<VarBinMetadata>;
47}
48
49impl VarBinArray {
50 pub fn try_new(
51 offsets: ArrayRef,
52 bytes: ByteBuffer,
53 dtype: DType,
54 validity: Validity,
55 ) -> VortexResult<Self> {
56 if !offsets.dtype().is_int() || offsets.dtype().is_nullable() {
57 vortex_bail!(MismatchedTypes: "non nullable int", offsets.dtype());
58 }
59 if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) {
60 vortex_bail!(MismatchedTypes: "utf8 or binary", dtype);
61 }
62 if dtype.is_nullable() == (validity == Validity::NonNullable) {
63 vortex_bail!("incorrect validity {:?}", validity);
64 }
65
66 Ok(Self {
67 dtype,
68 bytes,
69 offsets,
70 validity,
71 stats_set: Default::default(),
72 })
73 }
74
75 #[inline]
76 pub fn offsets(&self) -> &ArrayRef {
77 &self.offsets
78 }
79
80 pub fn validity(&self) -> &Validity {
81 &self.validity
82 }
83
84 #[inline]
92 pub fn bytes(&self) -> &ByteBuffer {
93 &self.bytes
94 }
95
96 pub fn sliced_bytes(&self) -> ByteBuffer {
99 let first_offset: usize = self.offset_at(0).vortex_expect("1st offset");
100 let last_offset = self.offset_at(self.len()).vortex_expect("Last offset");
101
102 self.bytes().slice(first_offset..last_offset)
103 }
104
105 pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
106 let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
107 if size < u32::MAX as usize {
108 Self::from_vec_sized::<u32, T>(vec, dtype)
109 } else {
110 Self::from_vec_sized::<u64, T>(vec, dtype)
111 }
112 }
113
114 fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
115 where
116 O: NativePType + PrimInt,
117 T: AsRef<[u8]>,
118 {
119 let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
120 for v in vec {
121 builder.append_value(v.as_ref());
122 }
123 builder.finish(dtype)
124 }
125
126 #[allow(clippy::same_name_method)]
127 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
128 iter: I,
129 dtype: DType,
130 ) -> Self {
131 let iter = iter.into_iter();
132 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
133 for v in iter {
134 builder.append(v.as_ref().map(|o| o.as_ref()));
135 }
136 builder.finish(dtype)
137 }
138
139 pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
140 iter: I,
141 dtype: DType,
142 ) -> Self {
143 let iter = iter.into_iter();
144 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
145 for v in iter {
146 builder.append_value(v);
147 }
148 builder.finish(dtype)
149 }
150
151 pub fn offset_at(&self, index: usize) -> VortexResult<usize> {
155 if index > self.len() + 1 {
156 vortex_bail!(OutOfBounds: index, 0, self.len() + 1)
157 }
158
159 Ok(scalar_at(self.offsets(), index)
161 .unwrap_or_else(|err| vortex_panic!(err, "Failed to get offset at index: {}", index))
162 .as_ref()
163 .try_into()
164 .vortex_expect("Failed to convert offset to usize"))
165 }
166
167 pub fn bytes_at(&self, index: usize) -> VortexResult<ByteBuffer> {
171 let start = self.offset_at(index)?;
172 let end = self.offset_at(index + 1)?;
173
174 Ok(self.bytes().slice(start..end))
175 }
176
177 pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) {
180 (self.dtype, self.bytes, self.offsets, self.validity)
181 }
182}
183
184impl ArrayValidityImpl for VarBinArray {
185 fn _is_valid(&self, index: usize) -> VortexResult<bool> {
186 self.validity.is_valid(index)
187 }
188
189 fn _all_valid(&self) -> VortexResult<bool> {
190 self.validity.all_valid()
191 }
192
193 fn _all_invalid(&self) -> VortexResult<bool> {
194 self.validity.all_invalid()
195 }
196
197 fn _validity_mask(&self) -> VortexResult<Mask> {
198 self.validity.to_mask(self.len())
199 }
200}
201
202impl ArrayImpl for VarBinArray {
203 type Encoding = VarBinEncoding;
204
205 fn _len(&self) -> usize {
206 self.offsets().len().saturating_sub(1)
207 }
208
209 fn _dtype(&self) -> &DType {
210 &self.dtype
211 }
212
213 fn _vtable(&self) -> VTableRef {
214 VTableRef::new_ref(&VarBinEncoding)
215 }
216
217 fn _with_children(&self, children: &[ArrayRef]) -> VortexResult<Self> {
218 let new = match children.len() {
219 1 => {
221 let offsets = children[0].clone();
222 Self::try_new(
223 offsets,
224 self.bytes().clone(),
225 self.dtype().clone(),
226 self.validity().clone(),
227 )?
228 }
229 2 => {
231 let offsets = children[0].clone();
232 let validity_array = children[1].clone();
233 Self::try_new(
234 offsets,
235 self.bytes().clone(),
236 self.dtype().clone(),
237 Validity::Array(validity_array),
238 )?
239 }
240 _ => vortex_bail!("unexpected number of new children"),
241 };
242
243 Ok(new)
244 }
245}
246
247impl ArrayStatisticsImpl for VarBinArray {
248 fn _stats_ref(&self) -> StatsSetRef<'_> {
249 self.stats_set.to_ref(self)
250 }
251}
252
253impl From<Vec<&[u8]>> for VarBinArray {
254 fn from(value: Vec<&[u8]>) -> Self {
255 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
256 }
257}
258
259impl From<Vec<Vec<u8>>> for VarBinArray {
260 fn from(value: Vec<Vec<u8>>) -> Self {
261 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
262 }
263}
264
265impl From<Vec<String>> for VarBinArray {
266 fn from(value: Vec<String>) -> Self {
267 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
268 }
269}
270
271impl From<Vec<&str>> for VarBinArray {
272 fn from(value: Vec<&str>) -> Self {
273 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
274 }
275}
276
277impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
278 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
279 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
280 }
281}
282
283impl FromIterator<Option<Vec<u8>>> for VarBinArray {
284 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
285 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
286 }
287}
288
289impl FromIterator<Option<String>> for VarBinArray {
290 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
291 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
292 }
293}
294
295impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
296 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
297 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
298 }
299}
300
301pub fn varbin_scalar(value: ByteBuffer, dtype: &DType) -> Scalar {
302 if matches!(dtype, DType::Utf8(_)) {
303 Scalar::try_utf8(value, dtype.nullability())
304 .map_err(|err| vortex_err!("Failed to create scalar from utf8 buffer: {}", err))
305 .vortex_unwrap()
306 } else {
307 Scalar::binary(value, dtype.nullability())
308 }
309}
310
311#[cfg(test)]
312mod test {
313 use rstest::{fixture, rstest};
314 use vortex_buffer::Buffer;
315 use vortex_dtype::{DType, Nullability};
316
317 use crate::ArrayRef;
318 use crate::array::Array;
319 use crate::arrays::primitive::PrimitiveArray;
320 use crate::arrays::varbin::VarBinArray;
321 use crate::compute::{scalar_at, slice};
322 use crate::validity::Validity;
323
324 #[fixture]
325 fn binary_array() -> ArrayRef {
326 let values = Buffer::copy_from("hello worldhello world this is a long string".as_bytes());
327 let offsets = PrimitiveArray::from_iter([0, 11, 44]);
328
329 VarBinArray::try_new(
330 offsets.into_array(),
331 values,
332 DType::Utf8(Nullability::NonNullable),
333 Validity::NonNullable,
334 )
335 .unwrap()
336 .into_array()
337 }
338
339 #[rstest]
340 pub fn test_scalar_at(binary_array: ArrayRef) {
341 assert_eq!(binary_array.len(), 2);
342 assert_eq!(scalar_at(&binary_array, 0).unwrap(), "hello world".into());
343 assert_eq!(
344 scalar_at(&binary_array, 1).unwrap(),
345 "hello world this is a long string".into()
346 )
347 }
348
349 #[rstest]
350 pub fn slice_array(binary_array: ArrayRef) {
351 let binary_arr = slice(&binary_array, 1, 2).unwrap();
352 assert_eq!(
353 scalar_at(&binary_arr, 0).unwrap(),
354 "hello world this is a long string".into()
355 );
356 }
357}