vortex_array/arrays/listview/array.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use num_traits::AsPrimitive;
7use vortex_dtype::DType;
8use vortex_dtype::IntegerPType;
9use vortex_dtype::match_each_integer_ptype;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_bail;
13use vortex_error::vortex_ensure;
14use vortex_error::vortex_err;
15
16use crate::Array;
17use crate::ArrayRef;
18use crate::ToCanonical;
19use crate::arrays::PrimitiveArray;
20use crate::arrays::PrimitiveVTable;
21use crate::arrays::bool;
22use crate::stats::ArrayStats;
23use crate::validity::Validity;
24
25/// The canonical encoding for variable-length list arrays.
26///
27/// The `ListViewArray` encoding differs from [`ListArray`] in that it stores a child `sizes` array
28/// in addition to a child `offsets` array (which is the _only_ child in [`ListArray`]).
29///
30/// In the past, we used [`ListArray`] as the canonical encoding for [`DType::List`], but we have
31/// since migrated to `ListViewArray` for a few reasons:
32///
33/// - Enables better SIMD vectorization (no sequential dependency when reading `offsets`)
34/// - Allows out-of-order offsets for better compression (we can shuffle the buffers)
35/// - Supports different integer types for offsets vs sizes
36///
37/// It is worth mentioning that this encoding mirrors Apache Arrow's `ListView` array type, but does
38/// not exactly mirror the similar type found in DuckDB and Velox, which stores the pair of offset
39/// and size in a row-major fashion rather than column-major. More specifically, the row-major
40/// layout has a single child array with alternating offset and size next to each other.
41///
42/// We choose the column-major layout as it allows better compressability, as well as using
43/// different (logical) integer widths for our `offsets` and `sizes` buffers (note that the
44/// compressor will likely compress to a different bit-packed width, but this is speaking strictly
45/// about flexibility in the logcial type).
46///
47/// # Examples
48///
49/// ```
50/// # use vortex_array::arrays::{ListViewArray, PrimitiveArray};
51/// # use vortex_array::validity::Validity;
52/// # use vortex_array::IntoArray;
53/// # use vortex_buffer::buffer;
54/// # use std::sync::Arc;
55/// #
56/// // Create a list view array representing [[3, 4], [1], [2, 3]].
57/// // Note: Unlike `ListArray`, offsets don't need to be monotonic.
58///
59/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
60/// let offsets = buffer![2u32, 0, 1].into_array(); // Out-of-order offsets
61/// let sizes = buffer![2u32, 1, 2].into_array(); // The sizes cause overlaps
62///
63/// let list_view = ListViewArray::new(
64/// elements.into_array(),
65/// offsets.into_array(),
66/// sizes.into_array(),
67/// Validity::NonNullable,
68/// );
69///
70/// assert_eq!(list_view.len(), 3);
71///
72/// // Access individual lists
73/// let first_list = list_view.list_elements_at(0);
74/// assert_eq!(first_list.len(), 2);
75/// // First list contains elements[2..4] = [3, 4]
76///
77/// let first_offset = list_view.offset_at(0);
78/// let first_size = list_view.size_at(0);
79/// assert_eq!(first_offset, 2);
80/// assert_eq!(first_size, 2);
81/// ```
82///
83/// [`ListArray`]: crate::arrays::ListArray
84#[derive(Clone, Debug)]
85pub struct ListViewArray {
86 /// The [`DType`] of the list array.
87 ///
88 /// This type **must** be the variant [`DType::List`].
89 pub(super) dtype: DType,
90
91 /// The `elements` data array, where each list scalar is a _slice_ of the `elements` array, and
92 /// each inner list element is a _scalar_ of the `elements` array.
93 elements: ArrayRef,
94
95 /// The `offsets` array indicating the start position of each list in elements.
96 ///
97 /// Since we also store `sizes`, this `offsets` field is allowed to be stored out-of-order
98 /// (which is different from [`ListArray`](crate::arrays::ListArray)),
99 offsets: ArrayRef,
100
101 /// The `sizes` array indicating the length of each list.
102 ///
103 /// This field is intended to be paired with a corresponding offset to determine the list scalar
104 /// we want to access.
105 sizes: ArrayRef,
106
107 // TODO(connor)[ListView]: Add the n+1 memory allocation optimization.
108 /// A flag denoting if the array is zero-copyable* to a [`ListArray`](crate::arrays::ListArray).
109 ///
110 /// We use this information to help us more efficiently rebuild / compact our data.
111 ///
112 /// When this flag is true (indicating sorted offsets with no gaps and no overlaps and all
113 /// `offsets[i] + sizes[i]` are in order), conversions can bypass the very expensive rebuild
114 /// process which must rebuild the array from scratch.
115 is_zero_copy_to_list: bool,
116
117 /// The validity / null map of the array.
118 ///
119 /// Note that this null map refers to which list scalars are null, **not** which sub-elements of
120 /// list scalars are null. The `elements` array will track individual value nullability.
121 pub(super) validity: Validity,
122
123 /// The stats for this array.
124 pub(super) stats_set: ArrayStats,
125}
126
127impl ListViewArray {
128 /// Creates a new [`ListViewArray`].
129 ///
130 /// # Panics
131 ///
132 /// Panics if the provided components do not satisfy the invariants documented
133 /// in [`ListViewArray::new_unchecked`].
134 pub fn new(elements: ArrayRef, offsets: ArrayRef, sizes: ArrayRef, validity: Validity) -> Self {
135 Self::try_new(elements, offsets, sizes, validity)
136 .vortex_expect("`ListViewArray` construction failed")
137 }
138
139 /// Constructs a new `ListViewArray`.
140 ///
141 /// # Errors
142 ///
143 /// Returns an error if the provided components do not satisfy the invariants documented
144 /// in [`ListViewArray::new_unchecked`].
145 pub fn try_new(
146 elements: ArrayRef,
147 offsets: ArrayRef,
148 sizes: ArrayRef,
149 validity: Validity,
150 ) -> VortexResult<Self> {
151 Self::validate(&elements, &offsets, &sizes, &validity)?;
152
153 Ok(Self {
154 dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
155 elements,
156 offsets,
157 sizes,
158 validity,
159 is_zero_copy_to_list: false,
160 stats_set: Default::default(),
161 })
162 }
163
164 /// Creates a new [`ListViewArray`] without validation.
165 ///
166 /// This unsafe function does not check the validity of the data. Prefer calling [`new()`] or
167 /// [`try_new()`] over this function, as they will check the validity of the data.
168 ///
169 /// [`ListArray`]: crate::arrays::ListArray
170 /// [`new()`]: Self::new
171 /// [`try_new()`]: Self::try_new
172 ///
173 /// # Safety
174 ///
175 /// The caller must ensure all of the following invariants are satisfied:
176 ///
177 /// - `offsets` and `sizes` must be non-nullable integer arrays.
178 /// - `offsets` and `sizes` must have the same length.
179 /// - Size integer width must be smaller than or equal to offset type (to prevent overflow).
180 /// - For each `i`, `offsets[i] + sizes[i]` must not overflow and must be `<= elements.len()`
181 /// (even if the corresponding view is defined as null by the validity array).
182 /// - If validity is an array, its length must equal `offsets.len()`.
183 pub unsafe fn new_unchecked(
184 elements: ArrayRef,
185 offsets: ArrayRef,
186 sizes: ArrayRef,
187 validity: Validity,
188 ) -> Self {
189 if cfg!(debug_assertions) {
190 Self::validate(&elements, &offsets, &sizes, &validity)
191 .vortex_expect("Failed to crate `ListViewArray`");
192 }
193
194 Self {
195 dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
196 elements,
197 offsets,
198 sizes,
199 validity,
200 is_zero_copy_to_list: false,
201 stats_set: Default::default(),
202 }
203 }
204
205 /// Validates the components that would be used to create a [`ListViewArray`].
206 pub fn validate(
207 elements: &dyn Array,
208 offsets: &dyn Array,
209 sizes: &dyn Array,
210 validity: &Validity,
211 ) -> VortexResult<()> {
212 // Check that offsets and sizes are integer arrays and non-nullable.
213 vortex_ensure!(
214 offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
215 "offsets must be non-nullable integer array, got {}",
216 offsets.dtype()
217 );
218 vortex_ensure!(
219 sizes.dtype().is_int() && !sizes.dtype().is_nullable(),
220 "sizes must be non-nullable integer array, got {}",
221 sizes.dtype()
222 );
223
224 // Check that they have the same length.
225 vortex_ensure!(
226 offsets.len() == sizes.len(),
227 "offsets and sizes must have the same length, got {} and {}",
228 offsets.len(),
229 sizes.len()
230 );
231
232 // Check that the size type can fit within the offset type to prevent overflows.
233 let size_ptype = sizes.dtype().as_ptype();
234 let offset_ptype = offsets.dtype().as_ptype();
235
236 // If a validity array is present, it must be the same length as the `ListViewArray`.
237 if let Some(validity_len) = validity.maybe_len() {
238 vortex_ensure!(
239 validity_len == offsets.len(),
240 "validity with size {validity_len} does not match array size {}",
241 offsets.len()
242 );
243 }
244
245 let offsets_primitive = offsets.to_primitive();
246 let sizes_primitive = sizes.to_primitive();
247
248 // Validate the `offsets` and `sizes` arrays.
249 match_each_integer_ptype!(offset_ptype, |O| {
250 match_each_integer_ptype!(size_ptype, |S| {
251 let offsets_slice = offsets_primitive.as_slice::<O>();
252 let sizes_slice = sizes_primitive.as_slice::<S>();
253
254 validate_offsets_and_sizes::<O, S>(
255 offsets_slice,
256 sizes_slice,
257 elements.len() as u64,
258 )?;
259 })
260 });
261
262 Ok(())
263 }
264
265 /// Sets whether this [`ListViewArray`] is zero-copyable to a [`ListArray`].
266 ///
267 /// This is an optimization flag that enables more efficient conversion to [`ListArray`] without
268 /// needing to copy or reorganize the data.
269 ///
270 /// [`ListArray`]: crate::arrays::ListArray
271 ///
272 /// # Safety
273 ///
274 /// When setting `is_zctl` to `true`, the caller must ensure that the [`ListViewArray`] is
275 /// actually zero-copyable to a [`ListArray`]. This means:
276 ///
277 /// - Offsets must be sorted (but not strictly sorted, zero-length lists are allowed).
278 /// - `offsets[i] + sizes[i] == offsets[i + 1]` for all `i`.
279 /// - No gaps in elements between first and last referenced elements.
280 /// - No overlapping list views (each element referenced at most once).
281 ///
282 /// Note that leading and trailing unreferenced elements **ARE** allowed.
283 pub unsafe fn with_zero_copy_to_list(mut self, is_zctl: bool) -> Self {
284 if cfg!(debug_assertions) && is_zctl {
285 validate_zctl(
286 &self.elements,
287 self.offsets.to_primitive(),
288 self.sizes.to_primitive(),
289 )
290 .vortex_expect("Failed to validate zero-copy to list flag");
291 }
292 self.is_zero_copy_to_list = is_zctl;
293 self
294 }
295
296 /// Verifies that the `ListViewArray` is zero-copyable to a [`ListArray`].
297 ///
298 /// This will run an expensive validation of the `ListViewArray`'s components. It will check the
299 /// following things:
300 ///
301 /// - Offsets must be sorted (but not strictly sorted, zero-length lists are allowed).
302 /// - No gaps in elements between first and last referenced elements.
303 /// - No overlapping list views (each element referenced at most once).
304 ///
305 /// Note that leading and trailing unreferenced elements **ARE** allowed.
306 ///
307 /// This method should really only be called if the caller knows that the `ListViewArray` will
308 /// be converted into a [`ListArray`] in the future, and the caller wants to set the
309 /// optimization flag to `true` with the unsafe [`with_zero_copy_to_list`] method.
310 ///
311 /// [`ListArray`]: crate::arrays::ListArray
312 /// [`with_zero_copy_to_list`]: Self::with_zero_copy_to_list
313 pub fn verify_is_zero_copy_to_list(&self) -> bool {
314 validate_zctl(
315 &self.elements,
316 self.offsets.to_primitive(),
317 self.sizes.to_primitive(),
318 )
319 .is_ok()
320 }
321
322 pub fn into_parts(self) -> (ArrayRef, ArrayRef, ArrayRef, Validity) {
323 (self.elements, self.offsets, self.sizes, self.validity)
324 }
325
326 /// Returns the offset at the given index.
327 ///
328 /// Note that it is possible the corresponding list view is null (which is only defined by the
329 /// validity map). Regardless, we are still guaranteed that this offset is valid by the
330 /// invariants of [`ListViewArray`].
331 pub fn offset_at(&self, index: usize) -> usize {
332 assert!(
333 index < self.len(),
334 "Index {index} out of bounds 0..{}",
335 self.len()
336 );
337
338 // Fast path for `PrimitiveArray`.
339 self.offsets
340 .as_opt::<PrimitiveVTable>()
341 .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
342 .unwrap_or_else(|| {
343 // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
344 self.offsets
345 .scalar_at(index)
346 .as_primitive()
347 .as_::<usize>()
348 .vortex_expect("offset must fit in usize")
349 })
350 }
351
352 /// Returns the size at the given index.
353 ///
354 /// Note that it is possible the corresponding list view is null (which is only defined by the
355 /// validity map). Regardless, we are still guaranteed that this size is valid by the invariants
356 /// of [`ListViewArray`].
357 pub fn size_at(&self, index: usize) -> usize {
358 assert!(
359 index < self.len(),
360 "Index {} out of bounds 0..{}",
361 index,
362 self.len()
363 );
364
365 // Fast path for `PrimitiveArray`.
366 self.sizes
367 .as_opt::<PrimitiveVTable>()
368 .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
369 .unwrap_or_else(|| {
370 // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
371 self.sizes
372 .scalar_at(index)
373 .as_primitive()
374 .as_::<usize>()
375 .vortex_expect("size must fit in usize")
376 })
377 }
378
379 /// Returns the elements at the given index from the list array.
380 pub fn list_elements_at(&self, index: usize) -> ArrayRef {
381 let offset = self.offset_at(index);
382 let size = self.size_at(index);
383 self.elements().slice(offset..offset + size)
384 }
385
386 /// Returns the offsets array.
387 pub fn offsets(&self) -> &ArrayRef {
388 &self.offsets
389 }
390
391 /// Returns the sizes array.
392 pub fn sizes(&self) -> &ArrayRef {
393 &self.sizes
394 }
395
396 /// Returns the elements array.
397 pub fn elements(&self) -> &ArrayRef {
398 &self.elements
399 }
400
401 /// Returns true if the `ListViewArray` is zero-copyable to a
402 /// [`ListArray`](crate::arrays::ListArray).
403 pub fn is_zero_copy_to_list(&self) -> bool {
404 self.is_zero_copy_to_list
405 }
406}
407
408/// Helper function to validate `offsets` and `sizes` with specific types.
409fn validate_offsets_and_sizes<O, S>(
410 offsets_slice: &[O],
411 sizes_slice: &[S],
412 elements_len: u64,
413) -> VortexResult<()>
414where
415 O: IntegerPType,
416 S: IntegerPType,
417{
418 debug_assert_eq!(offsets_slice.len(), sizes_slice.len());
419
420 #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
421 for i in 0..offsets_slice.len() {
422 let offset = offsets_slice[i];
423 let size = sizes_slice[i];
424
425 vortex_ensure!(offset >= O::zero(), "cannot have negative offsets");
426 vortex_ensure!(size >= S::zero(), "cannot have negative size");
427
428 let offset_u64 = offset
429 .to_u64()
430 .ok_or_else(|| vortex_err!("offset[{i}] = {offset:?} cannot be converted to u64"))?;
431
432 let size_u64 = size
433 .to_u64()
434 .ok_or_else(|| vortex_err!("size[{i}] = {size:?} cannot be converted to u64"))?;
435
436 // Check for overflow when adding offset + size.
437 let end = offset_u64.checked_add(size_u64).ok_or_else(|| {
438 vortex_err!("offset[{i}] ({offset_u64}) + size[{i}] ({size_u64}) would overflow u64")
439 })?;
440
441 if offset_u64 == elements_len {
442 vortex_ensure!(
443 size_u64 == 0,
444 "views to the end of the elements array (length {elements_len}) must have size 0 \
445 (had size {size_u64})"
446 );
447 }
448
449 vortex_ensure!(
450 end <= elements_len,
451 "offset[{i}] + size[{i}] = {offset_u64} + {size_u64} = {end} \
452 exceeds elements length {elements_len}",
453 );
454 }
455
456 Ok(())
457}
458
459/// Helper function to validate if the [`ListViewArray`] components are actually zero-copyable to
460/// [`ListArray`](crate::arrays::ListArray).
461fn validate_zctl(
462 elements: &dyn Array,
463 offsets_primitive: PrimitiveArray,
464 sizes_primitive: PrimitiveArray,
465) -> VortexResult<()> {
466 // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed), even
467 // if there are null views.
468 if let Some(is_sorted) = offsets_primitive.statistics().compute_is_sorted() {
469 vortex_ensure!(is_sorted, "offsets must be sorted");
470 } else {
471 vortex_bail!("offsets must report is_sorted statistic");
472 }
473
474 // Validate that offset[i] + size[i] <= offset[i+1] for all items
475 // This ensures views are non-overlapping and properly ordered for zero-copy-to-list
476 fn validate_monotonic_ends<O: IntegerPType, S: IntegerPType>(
477 offsets_slice: &[O],
478 sizes_slice: &[S],
479 len: usize,
480 ) -> VortexResult<()> {
481 let mut max_end = 0usize;
482
483 for i in 0..len {
484 let offset = offsets_slice[i].to_usize().unwrap_or(usize::MAX);
485 let size = sizes_slice[i].to_usize().unwrap_or(usize::MAX);
486
487 // Check that this view starts at or after the previous view ended
488 vortex_ensure!(
489 offset >= max_end,
490 "Zero-copy-to-list requires views to be non-overlapping and ordered: \
491 view[{}] starts at {} but previous views extend to {}",
492 i,
493 offset,
494 max_end
495 );
496
497 // Update max_end for the next iteration
498 let end = offset.saturating_add(size);
499 max_end = max_end.max(end);
500 }
501
502 Ok(())
503 }
504
505 let offsets_dtype = offsets_primitive.dtype();
506 let sizes_dtype = sizes_primitive.dtype();
507 let len = offsets_primitive.len();
508
509 // Check that offset + size values are monotonic (no overlaps)
510 match_each_integer_ptype!(offsets_dtype.as_ptype(), |O| {
511 match_each_integer_ptype!(sizes_dtype.as_ptype(), |S| {
512 let offsets_slice = offsets_primitive.as_slice::<O>();
513 let sizes_slice = sizes_primitive.as_slice::<S>();
514
515 validate_monotonic_ends(offsets_slice, sizes_slice, len)?;
516 })
517 });
518
519 // TODO(connor)[ListView]: Making this allocation is expensive, but the more efficient
520 // implementation would be even more complicated than this. We could use a bit buffer denoting
521 // if positions in `elements` are used, and then additionally store a separate flag that tells
522 // us if a position is used more than once.
523 let mut element_references = vec![0u8; elements.len()];
524
525 fn count_references<O: IntegerPType, S: IntegerPType>(
526 element_references: &mut [u8],
527 offsets_primitive: PrimitiveArray,
528 sizes_primitive: PrimitiveArray,
529 ) {
530 let offsets_slice = offsets_primitive.as_slice::<O>();
531 let sizes_slice = sizes_primitive.as_slice::<S>();
532
533 // Note that we ignore nulls here, as the "null" view metadata must still maintain the same
534 // invariants as non-null views, even for a `bool` information.
535 for i in 0..offsets_slice.len() {
536 let offset: usize = offsets_slice[i].as_();
537 let size: usize = sizes_slice[i].as_();
538 for j in offset..offset + size {
539 element_references[j] = element_references[j].saturating_add(1);
540 }
541 }
542 }
543
544 match_each_integer_ptype!(offsets_primitive.ptype(), |O| {
545 match_each_integer_ptype!(sizes_primitive.ptype(), |S| {
546 count_references::<O, S>(&mut element_references, offsets_primitive, sizes_primitive);
547 })
548 });
549
550 // Allow leading and trailing unreferenced elements, but not gaps in the middle.
551 let leftmost_used = element_references
552 .iter()
553 .position(|&references| references != 0);
554 let rightmost_used = element_references
555 .iter()
556 .rposition(|&references| references != 0);
557
558 if let (Some(first_ref), Some(last_ref)) = (leftmost_used, rightmost_used) {
559 vortex_ensure!(
560 element_references[first_ref..=last_ref]
561 .iter()
562 .all(|&references| references != 0),
563 "found gap in elements array between first and last referenced elements"
564 );
565 }
566
567 vortex_ensure!(element_references.iter().all(|&references| references <= 1));
568
569 Ok(())
570}