vortex_array/arrays/listview/array.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use num_traits::AsPrimitive;
7use vortex_dtype::DType;
8use vortex_dtype::IntegerPType;
9use vortex_dtype::match_each_integer_ptype;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_bail;
13use vortex_error::vortex_ensure;
14use vortex_error::vortex_err;
15
16use crate::Array;
17use crate::ArrayRef;
18use crate::ToCanonical;
19use crate::arrays::PrimitiveArray;
20use crate::arrays::PrimitiveVTable;
21use crate::arrays::bool;
22use crate::stats::ArrayStats;
23use crate::validity::Validity;
24
25/// The canonical encoding for variable-length list arrays.
26///
27/// The `ListViewArray` encoding differs from [`ListArray`] in that it stores a child `sizes` array
28/// in addition to a child `offsets` array (which is the _only_ child in [`ListArray`]).
29///
30/// In the past, we used [`ListArray`] as the canonical encoding for [`DType::List`], but we have
31/// since migrated to `ListViewArray` for a few reasons:
32///
33/// - Enables better SIMD vectorization (no sequential dependency when reading `offsets`)
34/// - Allows out-of-order offsets for better compression (we can shuffle the buffers)
35/// - Supports different integer types for offsets vs sizes
36///
37/// It is worth mentioning that this encoding mirrors Apache Arrow's `ListView` array type, but does
38/// not exactly mirror the similar type found in DuckDB and Velox, which stores the pair of offset
39/// and size in a row-major fashion rather than column-major. More specifically, the row-major
40/// layout has a single child array with alternating offset and size next to each other.
41///
42/// We choose the column-major layout as it allows better compressability, as well as using
43/// different (logical) integer widths for our `offsets` and `sizes` buffers (note that the
44/// compressor will likely compress to a different bit-packed width, but this is speaking strictly
45/// about flexibility in the logcial type).
46///
47/// # Examples
48///
49/// ```
50/// # fn main() -> vortex_error::VortexResult<()> {
51/// # use vortex_array::arrays::{ListViewArray, PrimitiveArray};
52/// # use vortex_array::validity::Validity;
53/// # use vortex_array::IntoArray;
54/// # use vortex_buffer::buffer;
55/// # use std::sync::Arc;
56/// #
57/// // Create a list view array representing [[3, 4], [1], [2, 3]].
58/// // Note: Unlike `ListArray`, offsets don't need to be monotonic.
59///
60/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
61/// let offsets = buffer![2u32, 0, 1].into_array(); // Out-of-order offsets
62/// let sizes = buffer![2u32, 1, 2].into_array(); // The sizes cause overlaps
63///
64/// let list_view = ListViewArray::new(
65/// elements.into_array(),
66/// offsets.into_array(),
67/// sizes.into_array(),
68/// Validity::NonNullable,
69/// );
70///
71/// assert_eq!(list_view.len(), 3);
72///
73/// // Access individual lists
74/// let first_list = list_view.list_elements_at(0)?;
75/// assert_eq!(first_list.len(), 2);
76/// // First list contains elements[2..4] = [3, 4]
77///
78/// let first_offset = list_view.offset_at(0);
79/// let first_size = list_view.size_at(0);
80/// assert_eq!(first_offset, 2);
81/// assert_eq!(first_size, 2);
82/// # Ok(())
83/// # }
84/// ```
85///
86/// [`ListArray`]: crate::arrays::ListArray
87#[derive(Clone, Debug)]
88pub struct ListViewArray {
89 /// The [`DType`] of the list array.
90 ///
91 /// This type **must** be the variant [`DType::List`].
92 pub(super) dtype: DType,
93
94 /// The `elements` data array, where each list scalar is a _slice_ of the `elements` array, and
95 /// each inner list element is a _scalar_ of the `elements` array.
96 elements: ArrayRef,
97
98 /// The `offsets` array indicating the start position of each list in elements.
99 ///
100 /// Since we also store `sizes`, this `offsets` field is allowed to be stored out-of-order
101 /// (which is different from [`ListArray`](crate::arrays::ListArray)),
102 offsets: ArrayRef,
103
104 /// The `sizes` array indicating the length of each list.
105 ///
106 /// This field is intended to be paired with a corresponding offset to determine the list scalar
107 /// we want to access.
108 sizes: ArrayRef,
109
110 // TODO(connor)[ListView]: Add the n+1 memory allocation optimization.
111 /// A flag denoting if the array is zero-copyable* to a [`ListArray`](crate::arrays::ListArray).
112 ///
113 /// We use this information to help us more efficiently rebuild / compact our data.
114 ///
115 /// When this flag is true (indicating sorted offsets with no gaps and no overlaps and all
116 /// `offsets[i] + sizes[i]` are in order), conversions can bypass the very expensive rebuild
117 /// process which must rebuild the array from scratch.
118 is_zero_copy_to_list: bool,
119
120 /// The validity / null map of the array.
121 ///
122 /// Note that this null map refers to which list scalars are null, **not** which sub-elements of
123 /// list scalars are null. The `elements` array will track individual value nullability.
124 pub(super) validity: Validity,
125
126 /// The stats for this array.
127 pub(super) stats_set: ArrayStats,
128}
129
130pub struct ListViewArrayParts {
131 pub elements_dtype: Arc<DType>,
132
133 /// See `ListViewArray::elements`
134 pub elements: ArrayRef,
135
136 /// See `ListViewArray::offsets`
137 pub offsets: ArrayRef,
138
139 /// See `ListViewArray::sizes`
140 pub sizes: ArrayRef,
141
142 /// See `ListViewArray::validity`
143 pub validity: Validity,
144}
145
146impl ListViewArray {
147 /// Creates a new [`ListViewArray`].
148 ///
149 /// # Panics
150 ///
151 /// Panics if the provided components do not satisfy the invariants documented
152 /// in [`ListViewArray::new_unchecked`].
153 pub fn new(elements: ArrayRef, offsets: ArrayRef, sizes: ArrayRef, validity: Validity) -> Self {
154 Self::try_new(elements, offsets, sizes, validity)
155 .vortex_expect("`ListViewArray` construction failed")
156 }
157
158 /// Constructs a new `ListViewArray`.
159 ///
160 /// # Errors
161 ///
162 /// Returns an error if the provided components do not satisfy the invariants documented
163 /// in [`ListViewArray::new_unchecked`].
164 pub fn try_new(
165 elements: ArrayRef,
166 offsets: ArrayRef,
167 sizes: ArrayRef,
168 validity: Validity,
169 ) -> VortexResult<Self> {
170 Self::validate(&elements, &offsets, &sizes, &validity)?;
171
172 Ok(Self {
173 dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
174 elements,
175 offsets,
176 sizes,
177 validity,
178 is_zero_copy_to_list: false,
179 stats_set: Default::default(),
180 })
181 }
182
183 /// Creates a new [`ListViewArray`] without validation.
184 ///
185 /// This unsafe function does not check the validity of the data. Prefer calling [`new()`] or
186 /// [`try_new()`] over this function, as they will check the validity of the data.
187 ///
188 /// [`ListArray`]: crate::arrays::ListArray
189 /// [`new()`]: Self::new
190 /// [`try_new()`]: Self::try_new
191 ///
192 /// # Safety
193 ///
194 /// The caller must ensure all of the following invariants are satisfied:
195 ///
196 /// - `offsets` and `sizes` must be non-nullable integer arrays.
197 /// - `offsets` and `sizes` must have the same length.
198 /// - Size integer width must be smaller than or equal to offset type (to prevent overflow).
199 /// - For each `i`, `offsets[i] + sizes[i]` must not overflow and must be `<= elements.len()`
200 /// (even if the corresponding view is defined as null by the validity array).
201 /// - If validity is an array, its length must equal `offsets.len()`.
202 pub unsafe fn new_unchecked(
203 elements: ArrayRef,
204 offsets: ArrayRef,
205 sizes: ArrayRef,
206 validity: Validity,
207 ) -> Self {
208 if cfg!(debug_assertions) {
209 Self::validate(&elements, &offsets, &sizes, &validity)
210 .vortex_expect("Failed to crate `ListViewArray`");
211 }
212
213 Self {
214 dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
215 elements,
216 offsets,
217 sizes,
218 validity,
219 is_zero_copy_to_list: false,
220 stats_set: Default::default(),
221 }
222 }
223
224 /// Validates the components that would be used to create a [`ListViewArray`].
225 pub fn validate(
226 elements: &dyn Array,
227 offsets: &dyn Array,
228 sizes: &dyn Array,
229 validity: &Validity,
230 ) -> VortexResult<()> {
231 // Check that offsets and sizes are integer arrays and non-nullable.
232 vortex_ensure!(
233 offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
234 "offsets must be non-nullable integer array, got {}",
235 offsets.dtype()
236 );
237 vortex_ensure!(
238 sizes.dtype().is_int() && !sizes.dtype().is_nullable(),
239 "sizes must be non-nullable integer array, got {}",
240 sizes.dtype()
241 );
242
243 // Check that they have the same length.
244 vortex_ensure!(
245 offsets.len() == sizes.len(),
246 "offsets and sizes must have the same length, got {} and {}",
247 offsets.len(),
248 sizes.len()
249 );
250
251 // Check that the size type can fit within the offset type to prevent overflows.
252 let size_ptype = sizes.dtype().as_ptype();
253 let offset_ptype = offsets.dtype().as_ptype();
254
255 // If a validity array is present, it must be the same length as the `ListViewArray`.
256 if let Some(validity_len) = validity.maybe_len() {
257 vortex_ensure!(
258 validity_len == offsets.len(),
259 "validity with size {validity_len} does not match array size {}",
260 offsets.len()
261 );
262 }
263
264 // Skip host-only validation when offsets/sizes are not host-resident.
265 if offsets.is_host() && sizes.is_host() {
266 let offsets_primitive = offsets.to_primitive();
267 let sizes_primitive = sizes.to_primitive();
268
269 // Validate the `offsets` and `sizes` arrays.
270 match_each_integer_ptype!(offset_ptype, |O| {
271 match_each_integer_ptype!(size_ptype, |S| {
272 let offsets_slice = offsets_primitive.as_slice::<O>();
273 let sizes_slice = sizes_primitive.as_slice::<S>();
274
275 validate_offsets_and_sizes::<O, S>(
276 offsets_slice,
277 sizes_slice,
278 elements.len() as u64,
279 )?;
280 })
281 });
282 }
283
284 Ok(())
285 }
286
287 /// Sets whether this [`ListViewArray`] is zero-copyable to a [`ListArray`].
288 ///
289 /// This is an optimization flag that enables more efficient conversion to [`ListArray`] without
290 /// needing to copy or reorganize the data.
291 ///
292 /// [`ListArray`]: crate::arrays::ListArray
293 ///
294 /// # Safety
295 ///
296 /// When setting `is_zctl` to `true`, the caller must ensure that the [`ListViewArray`] is
297 /// actually zero-copyable to a [`ListArray`]. This means:
298 ///
299 /// - Offsets must be sorted (but not strictly sorted, zero-length lists are allowed).
300 /// - `offsets[i] + sizes[i] == offsets[i + 1]` for all `i`.
301 /// - No gaps in elements between first and last referenced elements.
302 /// - No overlapping list views (each element referenced at most once).
303 ///
304 /// Note that leading and trailing unreferenced elements **ARE** allowed.
305 pub unsafe fn with_zero_copy_to_list(mut self, is_zctl: bool) -> Self {
306 if cfg!(debug_assertions) && is_zctl {
307 validate_zctl(
308 &self.elements,
309 self.offsets.to_primitive(),
310 self.sizes.to_primitive(),
311 )
312 .vortex_expect("Failed to validate zero-copy to list flag");
313 }
314 self.is_zero_copy_to_list = is_zctl;
315 self
316 }
317
318 /// Verifies that the `ListViewArray` is zero-copyable to a [`ListArray`].
319 ///
320 /// This will run an expensive validation of the `ListViewArray`'s components. It will check the
321 /// following things:
322 ///
323 /// - Offsets must be sorted (but not strictly sorted, zero-length lists are allowed).
324 /// - No gaps in elements between first and last referenced elements.
325 /// - No overlapping list views (each element referenced at most once).
326 ///
327 /// Note that leading and trailing unreferenced elements **ARE** allowed.
328 ///
329 /// This method should really only be called if the caller knows that the `ListViewArray` will
330 /// be converted into a [`ListArray`] in the future, and the caller wants to set the
331 /// optimization flag to `true` with the unsafe [`with_zero_copy_to_list`] method.
332 ///
333 /// [`ListArray`]: crate::arrays::ListArray
334 /// [`with_zero_copy_to_list`]: Self::with_zero_copy_to_list
335 pub fn verify_is_zero_copy_to_list(&self) -> bool {
336 validate_zctl(
337 &self.elements,
338 self.offsets.to_primitive(),
339 self.sizes.to_primitive(),
340 )
341 .is_ok()
342 }
343
344 pub fn into_parts(self) -> ListViewArrayParts {
345 let dtype = self.dtype.into_list_element_opt().vortex_expect("is list");
346 ListViewArrayParts {
347 elements_dtype: dtype,
348 elements: self.elements,
349 offsets: self.offsets,
350 sizes: self.sizes,
351 validity: self.validity,
352 }
353 }
354
355 /// Returns the offset at the given index.
356 ///
357 /// Note that it is possible the corresponding list view is null (which is only defined by the
358 /// validity map). Regardless, we are still guaranteed that this offset is valid by the
359 /// invariants of [`ListViewArray`].
360 pub fn offset_at(&self, index: usize) -> usize {
361 assert!(
362 index < self.len(),
363 "Index {index} out of bounds 0..{}",
364 self.len()
365 );
366
367 // Fast path for `PrimitiveArray`.
368 self.offsets
369 .as_opt::<PrimitiveVTable>()
370 .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
371 .unwrap_or_else(|| {
372 // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
373 self.offsets
374 .scalar_at(index)
375 .vortex_expect("offsets must support scalar_at")
376 .as_primitive()
377 .as_::<usize>()
378 .vortex_expect("offset must fit in usize")
379 })
380 }
381
382 /// Returns the size at the given index.
383 ///
384 /// Note that it is possible the corresponding list view is null (which is only defined by the
385 /// validity map). Regardless, we are still guaranteed that this size is valid by the invariants
386 /// of [`ListViewArray`].
387 pub fn size_at(&self, index: usize) -> usize {
388 assert!(
389 index < self.len(),
390 "Index {} out of bounds 0..{}",
391 index,
392 self.len()
393 );
394
395 // Fast path for `PrimitiveArray`.
396 self.sizes
397 .as_opt::<PrimitiveVTable>()
398 .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
399 .unwrap_or_else(|| {
400 // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
401 self.sizes
402 .scalar_at(index)
403 .vortex_expect("sizes must support scalar_at")
404 .as_primitive()
405 .as_::<usize>()
406 .vortex_expect("size must fit in usize")
407 })
408 }
409
410 /// Returns the elements at the given index from the list array.
411 ///
412 /// # Errors
413 ///
414 /// Returns an error if the slice operation fails.
415 pub fn list_elements_at(&self, index: usize) -> VortexResult<ArrayRef> {
416 let offset = self.offset_at(index);
417 let size = self.size_at(index);
418 self.elements().slice(offset..offset + size)
419 }
420
421 /// Returns the offsets array.
422 pub fn offsets(&self) -> &ArrayRef {
423 &self.offsets
424 }
425
426 /// Returns the sizes array.
427 pub fn sizes(&self) -> &ArrayRef {
428 &self.sizes
429 }
430
431 /// Returns the elements array.
432 pub fn elements(&self) -> &ArrayRef {
433 &self.elements
434 }
435
436 /// Returns true if the `ListViewArray` is zero-copyable to a
437 /// [`ListArray`](crate::arrays::ListArray).
438 pub fn is_zero_copy_to_list(&self) -> bool {
439 self.is_zero_copy_to_list
440 }
441}
442
443/// Helper function to validate `offsets` and `sizes` with specific types.
444fn validate_offsets_and_sizes<O, S>(
445 offsets_slice: &[O],
446 sizes_slice: &[S],
447 elements_len: u64,
448) -> VortexResult<()>
449where
450 O: IntegerPType,
451 S: IntegerPType,
452{
453 debug_assert_eq!(offsets_slice.len(), sizes_slice.len());
454
455 #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
456 for i in 0..offsets_slice.len() {
457 let offset = offsets_slice[i];
458 let size = sizes_slice[i];
459
460 vortex_ensure!(offset >= O::zero(), "cannot have negative offsets");
461 vortex_ensure!(size >= S::zero(), "cannot have negative size");
462
463 let offset_u64 = offset
464 .to_u64()
465 .ok_or_else(|| vortex_err!("offset[{i}] = {offset:?} cannot be converted to u64"))?;
466
467 let size_u64 = size
468 .to_u64()
469 .ok_or_else(|| vortex_err!("size[{i}] = {size:?} cannot be converted to u64"))?;
470
471 // Check for overflow when adding offset + size.
472 let end = offset_u64.checked_add(size_u64).ok_or_else(|| {
473 vortex_err!("offset[{i}] ({offset_u64}) + size[{i}] ({size_u64}) would overflow u64")
474 })?;
475
476 if offset_u64 == elements_len {
477 vortex_ensure!(
478 size_u64 == 0,
479 "views to the end of the elements array (length {elements_len}) must have size 0 \
480 (had size {size_u64})"
481 );
482 }
483
484 vortex_ensure!(
485 end <= elements_len,
486 "offset[{i}] + size[{i}] = {offset_u64} + {size_u64} = {end} \
487 exceeds elements length {elements_len}",
488 );
489 }
490
491 Ok(())
492}
493
494/// Helper function to validate if the [`ListViewArray`] components are actually zero-copyable to
495/// [`ListArray`](crate::arrays::ListArray).
496fn validate_zctl(
497 elements: &dyn Array,
498 offsets_primitive: PrimitiveArray,
499 sizes_primitive: PrimitiveArray,
500) -> VortexResult<()> {
501 // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed), even
502 // if there are null views.
503 if let Some(is_sorted) = offsets_primitive.statistics().compute_is_sorted() {
504 vortex_ensure!(is_sorted, "offsets must be sorted");
505 } else {
506 vortex_bail!("offsets must report is_sorted statistic");
507 }
508
509 // Validate that offset[i] + size[i] <= offset[i+1] for all items
510 // This ensures views are non-overlapping and properly ordered for zero-copy-to-list
511 fn validate_monotonic_ends<O: IntegerPType, S: IntegerPType>(
512 offsets_slice: &[O],
513 sizes_slice: &[S],
514 len: usize,
515 ) -> VortexResult<()> {
516 let mut max_end = 0usize;
517
518 for i in 0..len {
519 let offset = offsets_slice[i].to_usize().unwrap_or(usize::MAX);
520 let size = sizes_slice[i].to_usize().unwrap_or(usize::MAX);
521
522 // Check that this view starts at or after the previous view ended
523 vortex_ensure!(
524 offset >= max_end,
525 "Zero-copy-to-list requires views to be non-overlapping and ordered: \
526 view[{}] starts at {} but previous views extend to {}",
527 i,
528 offset,
529 max_end
530 );
531
532 // Update max_end for the next iteration
533 let end = offset.saturating_add(size);
534 max_end = max_end.max(end);
535 }
536
537 Ok(())
538 }
539
540 let offsets_dtype = offsets_primitive.dtype();
541 let sizes_dtype = sizes_primitive.dtype();
542 let len = offsets_primitive.len();
543
544 // Check that offset + size values are monotonic (no overlaps)
545 match_each_integer_ptype!(offsets_dtype.as_ptype(), |O| {
546 match_each_integer_ptype!(sizes_dtype.as_ptype(), |S| {
547 let offsets_slice = offsets_primitive.as_slice::<O>();
548 let sizes_slice = sizes_primitive.as_slice::<S>();
549
550 validate_monotonic_ends(offsets_slice, sizes_slice, len)?;
551 })
552 });
553
554 // TODO(connor)[ListView]: Making this allocation is expensive, but the more efficient
555 // implementation would be even more complicated than this. We could use a bit buffer denoting
556 // if positions in `elements` are used, and then additionally store a separate flag that tells
557 // us if a position is used more than once.
558 let mut element_references = vec![0u8; elements.len()];
559
560 fn count_references<O: IntegerPType, S: IntegerPType>(
561 element_references: &mut [u8],
562 offsets_primitive: PrimitiveArray,
563 sizes_primitive: PrimitiveArray,
564 ) {
565 let offsets_slice = offsets_primitive.as_slice::<O>();
566 let sizes_slice = sizes_primitive.as_slice::<S>();
567
568 // Note that we ignore nulls here, as the "null" view metadata must still maintain the same
569 // invariants as non-null views, even for a `bool` information.
570 for i in 0..offsets_slice.len() {
571 let offset: usize = offsets_slice[i].as_();
572 let size: usize = sizes_slice[i].as_();
573 for j in offset..offset + size {
574 element_references[j] = element_references[j].saturating_add(1);
575 }
576 }
577 }
578
579 match_each_integer_ptype!(offsets_primitive.ptype(), |O| {
580 match_each_integer_ptype!(sizes_primitive.ptype(), |S| {
581 count_references::<O, S>(&mut element_references, offsets_primitive, sizes_primitive);
582 })
583 });
584
585 // Allow leading and trailing unreferenced elements, but not gaps in the middle.
586 let leftmost_used = element_references
587 .iter()
588 .position(|&references| references != 0);
589 let rightmost_used = element_references
590 .iter()
591 .rposition(|&references| references != 0);
592
593 if let (Some(first_ref), Some(last_ref)) = (leftmost_used, rightmost_used) {
594 vortex_ensure!(
595 element_references[first_ref..=last_ref]
596 .iter()
597 .all(|&references| references != 0),
598 "found gap in elements array between first and last referenced elements"
599 );
600 }
601
602 vortex_ensure!(element_references.iter().all(|&references| references <= 1));
603
604 Ok(())
605}