vortex_array/arrays/varbin/
array.rs1use num_traits::AsPrimitive;
5use vortex_buffer::Buffer;
6use vortex_buffer::ByteBuffer;
7use vortex_dtype::DType;
8use vortex_dtype::IntegerPType;
9use vortex_dtype::Nullability;
10use vortex_dtype::match_each_integer_ptype;
11use vortex_error::VortexExpect;
12use vortex_error::VortexResult;
13use vortex_error::vortex_ensure;
14use vortex_error::vortex_err;
15
16use crate::Array;
17use crate::ArrayRef;
18use crate::IntoArray;
19use crate::ToCanonical;
20use crate::arrays::varbin::builder::VarBinBuilder;
21use crate::buffer::BufferHandle;
22use crate::stats::ArrayStats;
23use crate::validity::Validity;
24
25#[derive(Clone, Debug)]
26pub struct VarBinArray {
27 pub(super) dtype: DType,
28 pub(super) bytes: BufferHandle,
29 pub(super) offsets: ArrayRef,
30 pub(super) validity: Validity,
31 pub(super) stats_set: ArrayStats,
32}
33
34impl VarBinArray {
35 pub fn new(offsets: ArrayRef, bytes: ByteBuffer, dtype: DType, validity: Validity) -> Self {
42 Self::try_new(offsets, bytes, dtype, validity).vortex_expect("VarBinArray new")
43 }
44
45 pub fn new_from_handle(
52 offset: ArrayRef,
53 bytes: BufferHandle,
54 dtype: DType,
55 validity: Validity,
56 ) -> Self {
57 Self::try_new_from_handle(offset, bytes, dtype, validity).vortex_expect("VarBinArray new")
58 }
59
60 pub fn try_new(
69 offsets: ArrayRef,
70 bytes: ByteBuffer,
71 dtype: DType,
72 validity: Validity,
73 ) -> VortexResult<Self> {
74 let bytes = BufferHandle::new_host(bytes);
75 Self::validate(&offsets, &bytes, &dtype, &validity)?;
76
77 Ok(unsafe { Self::new_unchecked_from_handle(offsets, bytes, dtype, validity) })
79 }
80
81 pub fn try_new_from_handle(
91 offsets: ArrayRef,
92 bytes: BufferHandle,
93 dtype: DType,
94 validity: Validity,
95 ) -> VortexResult<Self> {
96 Self::validate(&offsets, &bytes, &dtype, &validity)?;
97
98 Ok(unsafe { Self::new_unchecked_from_handle(offsets, bytes, dtype, validity) })
100 }
101
102 pub unsafe fn new_unchecked(
131 offsets: ArrayRef,
132 bytes: ByteBuffer,
133 dtype: DType,
134 validity: Validity,
135 ) -> Self {
136 unsafe {
139 Self::new_unchecked_from_handle(offsets, BufferHandle::new_host(bytes), dtype, validity)
140 }
141 }
142
143 pub unsafe fn new_unchecked_from_handle(
150 offsets: ArrayRef,
151 bytes: BufferHandle,
152 dtype: DType,
153 validity: Validity,
154 ) -> Self {
155 #[cfg(debug_assertions)]
156 Self::validate(&offsets, &bytes, &dtype, &validity)
157 .vortex_expect("[Debug Assertion]: Invalid `VarBinArray` parameters");
158
159 Self {
160 dtype,
161 bytes,
162 offsets,
163 validity,
164 stats_set: Default::default(),
165 }
166 }
167
168 pub fn validate(
172 offsets: &dyn Array,
173 bytes: &BufferHandle,
174 dtype: &DType,
175 validity: &Validity,
176 ) -> VortexResult<()> {
177 vortex_ensure!(
179 offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
180 MismatchedTypes: "non nullable int", offsets.dtype()
181 );
182
183 vortex_ensure!(
185 matches!(dtype, DType::Binary(_) | DType::Utf8(_)),
186 MismatchedTypes: "utf8 or binary", dtype
187 );
188
189 vortex_ensure!(
191 dtype.is_nullable() != (validity == &Validity::NonNullable),
192 InvalidArgument: "incorrect validity {:?} for dtype {}",
193 validity,
194 dtype
195 );
196
197 vortex_ensure!(
199 !offsets.is_empty(),
200 InvalidArgument: "Offsets must have at least one element"
201 );
202
203 if offsets.is_host() && bytes.is_on_host() {
205 let last_offset = offsets
206 .scalar_at(offsets.len() - 1)?
207 .as_primitive()
208 .as_::<usize>()
209 .ok_or_else(
210 || vortex_err!(InvalidArgument: "Last offset must be convertible to usize"),
211 )?;
212 vortex_ensure!(
213 last_offset <= bytes.len(),
214 InvalidArgument: "Last offset {} exceeds bytes length {}",
215 last_offset,
216 bytes.len()
217 );
218 }
219
220 if let Some(validity_len) = validity.maybe_len() {
222 vortex_ensure!(
223 validity_len == offsets.len() - 1,
224 "Validity length {} doesn't match array length {}",
225 validity_len,
226 offsets.len() - 1
227 );
228 }
229
230 if offsets.is_host()
232 && bytes.is_on_host()
233 && matches!(dtype, DType::Utf8(_))
234 && let Some(bytes) = bytes.as_host_opt()
235 {
236 let primitive_offsets = offsets.to_primitive();
237 match_each_integer_ptype!(primitive_offsets.dtype().as_ptype(), |O| {
238 let offsets_slice = primitive_offsets.as_slice::<O>();
239 for (i, (start, end)) in offsets_slice
240 .windows(2)
241 .map(|o| (o[0].as_(), o[1].as_()))
242 .enumerate()
243 {
244 if validity.is_null(i)? {
245 continue;
246 }
247
248 let string_bytes = &bytes.as_ref()[start..end];
249 simdutf8::basic::from_utf8(string_bytes).map_err(|_| {
250 #[allow(clippy::unwrap_used)]
251 let err = simdutf8::compat::from_utf8(string_bytes).unwrap_err();
253 vortex_err!("invalid utf-8: {err} at index {i}")
254 })?;
255 }
256 });
257 }
258
259 Ok(())
260 }
261
262 #[inline]
263 pub fn offsets(&self) -> &ArrayRef {
264 &self.offsets
265 }
266
267 #[inline]
275 pub fn bytes(&self) -> &ByteBuffer {
276 self.bytes.as_host()
277 }
278
279 #[inline]
281 pub fn bytes_handle(&self) -> &BufferHandle {
282 &self.bytes
283 }
284
285 pub fn sliced_bytes(&self) -> ByteBuffer {
288 let first_offset: usize = self.offset_at(0);
289 let last_offset = self.offset_at(self.len());
290
291 self.bytes().slice(first_offset..last_offset)
292 }
293
294 pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
295 let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
296 if size < u32::MAX as usize {
297 Self::from_vec_sized::<u32, T>(vec, dtype)
298 } else {
299 Self::from_vec_sized::<u64, T>(vec, dtype)
300 }
301 }
302
303 fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
304 where
305 O: IntegerPType,
306 T: AsRef<[u8]>,
307 {
308 let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
309 for v in vec {
310 builder.append_value(v.as_ref());
311 }
312 builder.finish(dtype)
313 }
314
315 #[expect(
316 clippy::same_name_method,
317 reason = "intentionally named from_iter like Iterator::from_iter"
318 )]
319 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
320 iter: I,
321 dtype: DType,
322 ) -> Self {
323 let iter = iter.into_iter();
324 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
325 for v in iter {
326 builder.append(v.as_ref().map(|o| o.as_ref()));
327 }
328 builder.finish(dtype)
329 }
330
331 pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
332 iter: I,
333 dtype: DType,
334 ) -> Self {
335 let iter = iter.into_iter();
336 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
337 for v in iter {
338 builder.append_value(v);
339 }
340 builder.finish(dtype)
341 }
342
343 pub fn offset_at(&self, index: usize) -> usize {
349 assert!(
350 index <= self.len(),
351 "Index {index} out of bounds 0..={}",
352 self.len()
353 );
354
355 (&self
356 .offsets()
357 .scalar_at(index)
358 .vortex_expect("offsets must support scalar_at"))
359 .try_into()
360 .vortex_expect("Failed to convert offset to usize")
361 }
362
363 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
367 let start = self.offset_at(index);
368 let end = self.offset_at(index + 1);
369
370 self.bytes().slice(start..end)
371 }
372
373 pub fn into_parts(self) -> (DType, BufferHandle, ArrayRef, Validity) {
376 (self.dtype, self.bytes, self.offsets, self.validity)
377 }
378}
379
380impl VarBinArray {
381 #[doc(hidden)]
384 pub fn zero_offsets(self) -> Self {
385 if self.is_empty() {
386 return self;
387 }
388
389 let first = self.offset_at(0);
390
391 let bytes = self.sliced_bytes();
392 let dtype = self.dtype;
393 let validity = self.validity;
394 let offsets = self.offsets;
395
396 let offsets = if first == 0 {
397 offsets
398 } else {
399 let offsets = offsets.to_primitive();
400 match_each_integer_ptype!(offsets.ptype(), |P| {
401 let offsets = offsets.as_slice::<P>();
402 let buffer: Buffer<P> = offsets.iter().map(|index| index - offsets[0]).collect();
403 buffer.into_array()
404 })
405 };
406
407 unsafe { Self::new_unchecked(offsets, bytes, dtype, validity) }
410 }
411}
412
413impl From<Vec<&[u8]>> for VarBinArray {
414 fn from(value: Vec<&[u8]>) -> Self {
415 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
416 }
417}
418
419impl From<Vec<Vec<u8>>> for VarBinArray {
420 fn from(value: Vec<Vec<u8>>) -> Self {
421 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
422 }
423}
424
425impl From<Vec<String>> for VarBinArray {
426 fn from(value: Vec<String>) -> Self {
427 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
428 }
429}
430
431impl From<Vec<&str>> for VarBinArray {
432 fn from(value: Vec<&str>) -> Self {
433 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
434 }
435}
436
437impl From<Vec<Option<&[u8]>>> for VarBinArray {
438 fn from(value: Vec<Option<&[u8]>>) -> Self {
439 Self::from_iter(value, DType::Binary(Nullability::Nullable))
440 }
441}
442
443impl From<Vec<Option<Vec<u8>>>> for VarBinArray {
444 fn from(value: Vec<Option<Vec<u8>>>) -> Self {
445 Self::from_iter(value, DType::Binary(Nullability::Nullable))
446 }
447}
448
449impl From<Vec<Option<String>>> for VarBinArray {
450 fn from(value: Vec<Option<String>>) -> Self {
451 Self::from_iter(value, DType::Utf8(Nullability::Nullable))
452 }
453}
454
455impl From<Vec<Option<&str>>> for VarBinArray {
456 fn from(value: Vec<Option<&str>>) -> Self {
457 Self::from_iter(value, DType::Utf8(Nullability::Nullable))
458 }
459}
460
461impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
462 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
463 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
464 }
465}
466
467impl FromIterator<Option<Vec<u8>>> for VarBinArray {
468 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
469 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
470 }
471}
472
473impl FromIterator<Option<String>> for VarBinArray {
474 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
475 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
476 }
477}
478
479impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
480 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
481 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
482 }
483}