vortex_array/arrays/varbin/
array.rs1use num_traits::AsPrimitive;
5use vortex_buffer::ByteBuffer;
6use vortex_error::VortexExpect;
7use vortex_error::VortexResult;
8use vortex_error::vortex_ensure;
9use vortex_error::vortex_err;
10
11use crate::Array;
12use crate::ArrayRef;
13use crate::ToCanonical;
14use crate::arrays::varbin::builder::VarBinBuilder;
15use crate::buffer::BufferHandle;
16use crate::dtype::DType;
17use crate::dtype::IntegerPType;
18use crate::dtype::Nullability;
19use crate::match_each_integer_ptype;
20use crate::stats::ArrayStats;
21use crate::validity::Validity;
22
23#[derive(Clone, Debug)]
24pub struct VarBinArray {
25 pub(super) dtype: DType,
26 pub(super) bytes: BufferHandle,
27 pub(super) offsets: ArrayRef,
28 pub(super) validity: Validity,
29 pub(super) stats_set: ArrayStats,
30}
31
32impl VarBinArray {
33 pub fn new(offsets: ArrayRef, bytes: ByteBuffer, dtype: DType, validity: Validity) -> Self {
40 Self::try_new(offsets, bytes, dtype, validity).vortex_expect("VarBinArray new")
41 }
42
43 pub fn new_from_handle(
50 offset: ArrayRef,
51 bytes: BufferHandle,
52 dtype: DType,
53 validity: Validity,
54 ) -> Self {
55 Self::try_new_from_handle(offset, bytes, dtype, validity).vortex_expect("VarBinArray new")
56 }
57
58 pub fn try_new(
67 offsets: ArrayRef,
68 bytes: ByteBuffer,
69 dtype: DType,
70 validity: Validity,
71 ) -> VortexResult<Self> {
72 let bytes = BufferHandle::new_host(bytes);
73 Self::validate(&offsets, &bytes, &dtype, &validity)?;
74
75 Ok(unsafe { Self::new_unchecked_from_handle(offsets, bytes, dtype, validity) })
77 }
78
79 pub fn try_new_from_handle(
89 offsets: ArrayRef,
90 bytes: BufferHandle,
91 dtype: DType,
92 validity: Validity,
93 ) -> VortexResult<Self> {
94 Self::validate(&offsets, &bytes, &dtype, &validity)?;
95
96 Ok(unsafe { Self::new_unchecked_from_handle(offsets, bytes, dtype, validity) })
98 }
99
100 pub unsafe fn new_unchecked(
129 offsets: ArrayRef,
130 bytes: ByteBuffer,
131 dtype: DType,
132 validity: Validity,
133 ) -> Self {
134 unsafe {
137 Self::new_unchecked_from_handle(offsets, BufferHandle::new_host(bytes), dtype, validity)
138 }
139 }
140
141 pub unsafe fn new_unchecked_from_handle(
148 offsets: ArrayRef,
149 bytes: BufferHandle,
150 dtype: DType,
151 validity: Validity,
152 ) -> Self {
153 #[cfg(debug_assertions)]
154 Self::validate(&offsets, &bytes, &dtype, &validity)
155 .vortex_expect("[Debug Assertion]: Invalid `VarBinArray` parameters");
156
157 Self {
158 dtype,
159 bytes,
160 offsets,
161 validity,
162 stats_set: Default::default(),
163 }
164 }
165
166 pub fn validate(
170 offsets: &ArrayRef,
171 bytes: &BufferHandle,
172 dtype: &DType,
173 validity: &Validity,
174 ) -> VortexResult<()> {
175 vortex_ensure!(
177 offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
178 MismatchedTypes: "non nullable int", offsets.dtype()
179 );
180
181 vortex_ensure!(
183 matches!(dtype, DType::Binary(_) | DType::Utf8(_)),
184 MismatchedTypes: "utf8 or binary", dtype
185 );
186
187 vortex_ensure!(
189 dtype.is_nullable() != (validity == &Validity::NonNullable),
190 InvalidArgument: "incorrect validity {:?} for dtype {}",
191 validity,
192 dtype
193 );
194
195 vortex_ensure!(
197 !offsets.is_empty(),
198 InvalidArgument: "Offsets must have at least one element"
199 );
200
201 if offsets.is_host() && bytes.is_on_host() {
203 let last_offset = offsets
204 .scalar_at(offsets.len() - 1)?
205 .as_primitive()
206 .as_::<usize>()
207 .ok_or_else(
208 || vortex_err!(InvalidArgument: "Last offset must be convertible to usize"),
209 )?;
210 vortex_ensure!(
211 last_offset <= bytes.len(),
212 InvalidArgument: "Last offset {} exceeds bytes length {}",
213 last_offset,
214 bytes.len()
215 );
216 }
217
218 if let Some(validity_len) = validity.maybe_len() {
220 vortex_ensure!(
221 validity_len == offsets.len() - 1,
222 "Validity length {} doesn't match array length {}",
223 validity_len,
224 offsets.len() - 1
225 );
226 }
227
228 if offsets.is_host()
230 && bytes.is_on_host()
231 && matches!(dtype, DType::Utf8(_))
232 && let Some(bytes) = bytes.as_host_opt()
233 {
234 let primitive_offsets = offsets.to_primitive();
235 match_each_integer_ptype!(primitive_offsets.dtype().as_ptype(), |O| {
236 let offsets_slice = primitive_offsets.as_slice::<O>();
237 for (i, (start, end)) in offsets_slice
238 .windows(2)
239 .map(|o| (o[0].as_(), o[1].as_()))
240 .enumerate()
241 {
242 if validity.is_null(i)? {
243 continue;
244 }
245
246 let string_bytes = &bytes.as_ref()[start..end];
247 simdutf8::basic::from_utf8(string_bytes).map_err(|_| {
248 #[allow(clippy::unwrap_used)]
249 let err = simdutf8::compat::from_utf8(string_bytes).unwrap_err();
251 vortex_err!("invalid utf-8: {err} at index {i}")
252 })?;
253 }
254 });
255 }
256
257 Ok(())
258 }
259
260 #[inline]
261 pub fn offsets(&self) -> &ArrayRef {
262 &self.offsets
263 }
264
265 #[inline]
273 pub fn bytes(&self) -> &ByteBuffer {
274 self.bytes.as_host()
275 }
276
277 #[inline]
279 pub fn bytes_handle(&self) -> &BufferHandle {
280 &self.bytes
281 }
282
283 pub fn sliced_bytes(&self) -> ByteBuffer {
286 let first_offset: usize = self.offset_at(0);
287 let last_offset = self.offset_at(self.len());
288
289 self.bytes().slice(first_offset..last_offset)
290 }
291
292 pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
293 let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
294 if size < u32::MAX as usize {
295 Self::from_vec_sized::<u32, T>(vec, dtype)
296 } else {
297 Self::from_vec_sized::<u64, T>(vec, dtype)
298 }
299 }
300
301 fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
302 where
303 O: IntegerPType,
304 T: AsRef<[u8]>,
305 {
306 let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
307 for v in vec {
308 builder.append_value(v.as_ref());
309 }
310 builder.finish(dtype)
311 }
312
313 #[expect(
314 clippy::same_name_method,
315 reason = "intentionally named from_iter like Iterator::from_iter"
316 )]
317 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
318 iter: I,
319 dtype: DType,
320 ) -> Self {
321 let iter = iter.into_iter();
322 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
323 for v in iter {
324 builder.append(v.as_ref().map(|o| o.as_ref()));
325 }
326 builder.finish(dtype)
327 }
328
329 pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
330 iter: I,
331 dtype: DType,
332 ) -> Self {
333 let iter = iter.into_iter();
334 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
335 for v in iter {
336 builder.append_value(v);
337 }
338 builder.finish(dtype)
339 }
340
341 pub fn offset_at(&self, index: usize) -> usize {
347 assert!(
348 index <= self.len(),
349 "Index {index} out of bounds 0..={}",
350 self.len()
351 );
352
353 (&self
354 .offsets()
355 .scalar_at(index)
356 .vortex_expect("offsets must support scalar_at"))
357 .try_into()
358 .vortex_expect("Failed to convert offset to usize")
359 }
360
361 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
365 let start = self.offset_at(index);
366 let end = self.offset_at(index + 1);
367
368 self.bytes().slice(start..end)
369 }
370
371 pub fn into_parts(self) -> (DType, BufferHandle, ArrayRef, Validity) {
374 (self.dtype, self.bytes, self.offsets, self.validity)
375 }
376}
377
378impl From<Vec<&[u8]>> for VarBinArray {
379 fn from(value: Vec<&[u8]>) -> Self {
380 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
381 }
382}
383
384impl From<Vec<Vec<u8>>> for VarBinArray {
385 fn from(value: Vec<Vec<u8>>) -> Self {
386 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
387 }
388}
389
390impl From<Vec<String>> for VarBinArray {
391 fn from(value: Vec<String>) -> Self {
392 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
393 }
394}
395
396impl From<Vec<&str>> for VarBinArray {
397 fn from(value: Vec<&str>) -> Self {
398 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
399 }
400}
401
402impl From<Vec<Option<&[u8]>>> for VarBinArray {
403 fn from(value: Vec<Option<&[u8]>>) -> Self {
404 Self::from_iter(value, DType::Binary(Nullability::Nullable))
405 }
406}
407
408impl From<Vec<Option<Vec<u8>>>> for VarBinArray {
409 fn from(value: Vec<Option<Vec<u8>>>) -> Self {
410 Self::from_iter(value, DType::Binary(Nullability::Nullable))
411 }
412}
413
414impl From<Vec<Option<String>>> for VarBinArray {
415 fn from(value: Vec<Option<String>>) -> Self {
416 Self::from_iter(value, DType::Utf8(Nullability::Nullable))
417 }
418}
419
420impl From<Vec<Option<&str>>> for VarBinArray {
421 fn from(value: Vec<Option<&str>>) -> Self {
422 Self::from_iter(value, DType::Utf8(Nullability::Nullable))
423 }
424}
425
426impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
427 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
428 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
429 }
430}
431
432impl FromIterator<Option<Vec<u8>>> for VarBinArray {
433 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
434 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
435 }
436}
437
438impl FromIterator<Option<String>> for VarBinArray {
439 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
440 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
441 }
442}
443
444impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
445 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
446 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
447 }
448}