vortex_array/arrays/varbinview/
array.rs1use std::sync::Arc;
5
6use vortex_buffer::Alignment;
7use vortex_buffer::Buffer;
8use vortex_buffer::ByteBuffer;
9use vortex_dtype::DType;
10use vortex_dtype::Nullability;
11use vortex_error::VortexExpect;
12use vortex_error::VortexResult;
13use vortex_error::vortex_bail;
14use vortex_error::vortex_ensure;
15use vortex_error::vortex_err;
16use vortex_error::vortex_panic;
17
18use crate::arrays::BinaryView;
19use crate::buffer::BufferHandle;
20use crate::builders::ArrayBuilder;
21use crate::builders::VarBinViewBuilder;
22use crate::stats::ArrayStats;
23use crate::validity::Validity;
24
25#[derive(Clone, Debug)]
85pub struct VarBinViewArray {
86 pub(super) dtype: DType,
87 pub(super) buffers: Arc<[BufferHandle]>,
88 pub(super) views: BufferHandle,
89 pub(super) validity: Validity,
90 pub(super) stats_set: ArrayStats,
91}
92
93pub struct VarBinViewArrayParts {
94 pub dtype: DType,
95 pub buffers: Arc<[BufferHandle]>,
96 pub views: BufferHandle,
97 pub validity: Validity,
98}
99
100impl VarBinViewArray {
101 pub fn new(
108 views: Buffer<BinaryView>,
109 buffers: Arc<[ByteBuffer]>,
110 dtype: DType,
111 validity: Validity,
112 ) -> Self {
113 Self::try_new(views, buffers, dtype, validity)
114 .vortex_expect("VarBinViewArray construction failed")
115 }
116
117 pub fn new_handle(
124 views: BufferHandle,
125 buffers: Arc<[BufferHandle]>,
126 dtype: DType,
127 validity: Validity,
128 ) -> Self {
129 Self::try_new_handle(views, buffers, dtype, validity)
130 .vortex_expect("VarbinViewArray construction failed")
131 }
132
133 pub fn try_new(
142 views: Buffer<BinaryView>,
143 buffers: Arc<[ByteBuffer]>,
144 dtype: DType,
145 validity: Validity,
146 ) -> VortexResult<Self> {
147 Self::validate(&views, &buffers, &dtype, &validity)?;
148
149 Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) })
151 }
152
153 pub fn try_new_handle(
162 views: BufferHandle,
163 buffers: Arc<[BufferHandle]>,
164 dtype: DType,
165 validity: Validity,
166 ) -> VortexResult<Self> {
167 let views_nbytes = views.len();
168 vortex_ensure!(
169 views_nbytes.is_multiple_of(size_of::<BinaryView>()),
170 "Expected views buffer length ({views_nbytes}) to be a multiple of {}",
171 size_of::<BinaryView>()
172 );
173
174 if let Some(host) = views.as_host_opt() {
176 vortex_ensure!(
177 host.is_aligned(Alignment::of::<BinaryView>()),
178 "Views on host must be 16 byte aligned"
179 );
180 }
181
182 Ok(unsafe { Self::new_handle_unchecked(views, buffers, dtype, validity) })
184 }
185
186 pub unsafe fn new_unchecked(
216 views: Buffer<BinaryView>,
217 buffers: Arc<[ByteBuffer]>,
218 dtype: DType,
219 validity: Validity,
220 ) -> Self {
221 #[cfg(debug_assertions)]
222 Self::validate(&views, &buffers, &dtype, &validity)
223 .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters");
224
225 let handles: Vec<BufferHandle> = buffers
226 .iter()
227 .cloned()
228 .map(BufferHandle::new_host)
229 .collect();
230
231 let handles = Arc::from(handles);
232 let view_handle = BufferHandle::new_host(views.into_byte_buffer());
233 unsafe { Self::new_handle_unchecked(view_handle, handles, dtype, validity) }
234 }
235
236 pub unsafe fn new_handle_unchecked(
242 views: BufferHandle,
243 buffers: Arc<[BufferHandle]>,
244 dtype: DType,
245 validity: Validity,
246 ) -> Self {
247 Self {
248 views,
249 buffers,
250 dtype,
251 validity,
252 stats_set: Default::default(),
253 }
254 }
255
256 pub fn validate(
260 views: &Buffer<BinaryView>,
261 buffers: &Arc<[ByteBuffer]>,
262 dtype: &DType,
263 validity: &Validity,
264 ) -> VortexResult<()> {
265 vortex_ensure!(
266 validity.nullability() == dtype.nullability(),
267 InvalidArgument: "validity {:?} incompatible with nullability {:?}",
268 validity,
269 dtype.nullability()
270 );
271
272 match dtype {
273 DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
274 simdutf8::basic::from_utf8(string).is_ok()
275 })?,
276 DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
277 _ => vortex_bail!(InvalidArgument: "invalid DType {dtype} for `VarBinViewArray`"),
278 }
279
280 Ok(())
281 }
282
283 fn validate_views<F>(
284 views: &Buffer<BinaryView>,
285 buffers: &Arc<[ByteBuffer]>,
286 validity: &Validity,
287 validator: F,
288 ) -> VortexResult<()>
289 where
290 F: Fn(&[u8]) -> bool,
291 {
292 for (idx, &view) in views.iter().enumerate() {
293 if validity.is_null(idx)? {
294 continue;
295 }
296
297 if view.is_inlined() {
298 let bytes = &view.as_inlined().data[..view.len() as usize];
300 vortex_ensure!(
301 validator(bytes),
302 InvalidArgument: "view at index {idx}: inlined bytes failed utf-8 validation"
303 );
304 } else {
305 let view = view.as_view();
307 let buf_index = view.buffer_index as usize;
308 let start_offset = view.offset as usize;
309 let end_offset = start_offset.saturating_add(view.size as usize);
310
311 let buf = buffers.get(buf_index).ok_or_else(||
312 vortex_err!(InvalidArgument: "view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers",
313 buffers.len()))?;
314
315 vortex_ensure!(
316 start_offset < buf.len(),
317 InvalidArgument: "start offset {start_offset} out of bounds for buffer {buf_index} with size {}",
318 buf.len(),
319 );
320
321 vortex_ensure!(
322 end_offset <= buf.len(),
323 InvalidArgument: "end offset {end_offset} out of bounds for buffer {buf_index} with size {}",
324 buf.len(),
325 );
326
327 let bytes = &buf[start_offset..end_offset];
329 vortex_ensure!(
330 view.prefix == bytes[..4],
331 InvalidArgument: "VarBinView prefix does not match full string"
332 );
333
334 vortex_ensure!(
336 validator(bytes),
337 InvalidArgument: "view at index {idx}: outlined bytes fails utf-8 validation"
338 );
339 }
340 }
341
342 Ok(())
343 }
344
345 pub fn into_parts(self) -> VarBinViewArrayParts {
347 VarBinViewArrayParts {
348 dtype: self.dtype,
349 buffers: self.buffers,
350 views: self.views,
351 validity: self.validity,
352 }
353 }
354
355 pub fn nbuffers(&self) -> usize {
357 self.buffers.len()
358 }
359
360 #[inline]
366 pub fn views(&self) -> &[BinaryView] {
367 let host_views = self.views.as_host();
368 let len = host_views.len() / size_of::<BinaryView>();
369
370 unsafe { std::slice::from_raw_parts(host_views.as_ptr().cast(), len) }
372 }
373
374 pub fn views_handle(&self) -> &BufferHandle {
376 &self.views
377 }
378
379 #[inline]
383 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
384 let views = self.views();
385 let view = &views[index];
386 if !view.is_inlined() {
388 let view_ref = view.as_view();
389 self.buffer(view_ref.buffer_index as usize)
390 .slice(view_ref.as_range())
391 } else {
392 self.views_handle()
394 .as_host()
395 .clone()
396 .into_byte_buffer()
397 .slice_ref(view.as_inlined().value())
398 }
399 }
400
401 #[inline]
408 pub fn buffer(&self, idx: usize) -> &ByteBuffer {
409 if idx >= self.nbuffers() {
410 vortex_panic!(
411 "{idx} buffer index out of bounds, there are {} buffers",
412 self.nbuffers()
413 );
414 }
415 self.buffers[idx].as_host()
416 }
417
418 #[inline]
420 pub fn buffers(&self) -> &Arc<[BufferHandle]> {
421 &self.buffers
422 }
423
424 #[expect(
426 clippy::same_name_method,
427 reason = "intentionally named from_iter like Iterator::from_iter"
428 )]
429 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
430 iter: I,
431 dtype: DType,
432 ) -> Self {
433 let iter = iter.into_iter();
434 let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0);
435
436 for item in iter {
437 match item {
438 None => builder.append_null(),
439 Some(v) => builder.append_value(v),
440 }
441 }
442
443 builder.finish_into_varbinview()
444 }
445
446 pub fn from_iter_str<T: AsRef<str>, I: IntoIterator<Item = T>>(iter: I) -> Self {
447 let iter = iter.into_iter();
448 let mut builder = VarBinViewBuilder::with_capacity(
449 DType::Utf8(Nullability::NonNullable),
450 iter.size_hint().0,
451 );
452
453 for item in iter {
454 builder.append_value(item.as_ref());
455 }
456
457 builder.finish_into_varbinview()
458 }
459
460 pub fn from_iter_nullable_str<T: AsRef<str>, I: IntoIterator<Item = Option<T>>>(
461 iter: I,
462 ) -> Self {
463 let iter = iter.into_iter();
464 let mut builder = VarBinViewBuilder::with_capacity(
465 DType::Utf8(Nullability::Nullable),
466 iter.size_hint().0,
467 );
468
469 for item in iter {
470 match item {
471 None => builder.append_null(),
472 Some(v) => builder.append_value(v.as_ref()),
473 }
474 }
475
476 builder.finish_into_varbinview()
477 }
478
479 pub fn from_iter_bin<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(iter: I) -> Self {
480 let iter = iter.into_iter();
481 let mut builder = VarBinViewBuilder::with_capacity(
482 DType::Binary(Nullability::NonNullable),
483 iter.size_hint().0,
484 );
485
486 for item in iter {
487 builder.append_value(item.as_ref());
488 }
489
490 builder.finish_into_varbinview()
491 }
492
493 pub fn from_iter_nullable_bin<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
494 iter: I,
495 ) -> Self {
496 let iter = iter.into_iter();
497 let mut builder = VarBinViewBuilder::with_capacity(
498 DType::Binary(Nullability::Nullable),
499 iter.size_hint().0,
500 );
501
502 for item in iter {
503 match item {
504 None => builder.append_null(),
505 Some(v) => builder.append_value(v.as_ref()),
506 }
507 }
508
509 builder.finish_into_varbinview()
510 }
511}
512
513impl<'a> FromIterator<Option<&'a [u8]>> for VarBinViewArray {
514 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
515 Self::from_iter_nullable_bin(iter)
516 }
517}
518
519impl FromIterator<Option<Vec<u8>>> for VarBinViewArray {
520 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
521 Self::from_iter_nullable_bin(iter)
522 }
523}
524
525impl FromIterator<Option<String>> for VarBinViewArray {
526 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
527 Self::from_iter_nullable_str(iter)
528 }
529}
530
531impl<'a> FromIterator<Option<&'a str>> for VarBinViewArray {
532 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
533 Self::from_iter_nullable_str(iter)
534 }
535}