vortex_array/arrays/listview/array.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use num_traits::AsPrimitive;
7use vortex_dtype::{DType, IntegerPType, match_each_integer_ptype};
8use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_ensure, vortex_err};
9
10use crate::arrays::{PrimitiveArray, PrimitiveVTable, bool};
11use crate::stats::ArrayStats;
12use crate::validity::Validity;
13use crate::{Array, ArrayRef, ToCanonical};
14
15/// The canonical encoding for variable-length list arrays.
16///
17/// The `ListViewArray` encoding differs from [`ListArray`] in that it stores a child `sizes` array
18/// in addition to a child `offsets` array (which is the _only_ child in [`ListArray`]).
19///
20/// In the past, we used [`ListArray`] as the canonical encoding for [`DType::List`], but we have
21/// since migrated to `ListViewArray` for a few reasons:
22///
23/// - Enables better SIMD vectorization (no sequential dependency when reading `offsets`)
24/// - Allows out-of-order offsets for better compression (we can shuffle the buffers)
25/// - Supports different integer types for offsets vs sizes
26///
27/// It is worth mentioning that this encoding mirrors Apache Arrow's `ListView` array type, but does
28/// not exactly mirror the similar type found in DuckDB and Velox, which stores the pair of offset
29/// and size in a row-major fashion rather than column-major. More specifically, the row-major
30/// layout has a single child array with alternating offset and size next to each other.
31///
32/// We choose the column-major layout as it allows better compressability, as well as using
33/// different (logical) integer widths for our `offsets` and `sizes` buffers (note that the
34/// compressor will likely compress to a different bit-packed width, but this is speaking strictly
35/// about flexibility in the logcial type).
36///
37/// # Examples
38///
39/// ```
40/// # use vortex_array::arrays::{ListViewArray, PrimitiveArray};
41/// # use vortex_array::validity::Validity;
42/// # use vortex_array::IntoArray;
43/// # use vortex_buffer::buffer;
44/// # use std::sync::Arc;
45/// #
46/// // Create a list view array representing [[3, 4], [1], [2, 3]].
47/// // Note: Unlike `ListArray`, offsets don't need to be monotonic.
48///
49/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
50/// let offsets = buffer![2u32, 0, 1].into_array(); // Out-of-order offsets
51/// let sizes = buffer![2u32, 1, 2].into_array(); // The sizes cause overlaps
52///
53/// let list_view = ListViewArray::new(
54/// elements.into_array(),
55/// offsets.into_array(),
56/// sizes.into_array(),
57/// Validity::NonNullable,
58/// );
59///
60/// assert_eq!(list_view.len(), 3);
61///
62/// // Access individual lists
63/// let first_list = list_view.list_elements_at(0);
64/// assert_eq!(first_list.len(), 2);
65/// // First list contains elements[2..4] = [3, 4]
66///
67/// let first_offset = list_view.offset_at(0);
68/// let first_size = list_view.size_at(0);
69/// assert_eq!(first_offset, 2);
70/// assert_eq!(first_size, 2);
71/// ```
72///
73/// [`ListArray`]: crate::arrays::ListArray
74#[derive(Clone, Debug)]
75pub struct ListViewArray {
76 /// The [`DType`] of the list array.
77 ///
78 /// This type **must** be the variant [`DType::List`].
79 pub(super) dtype: DType,
80
81 /// The `elements` data array, where each list scalar is a _slice_ of the `elements` array, and
82 /// each inner list element is a _scalar_ of the `elements` array.
83 elements: ArrayRef,
84
85 /// The `offsets` array indicating the start position of each list in elements.
86 ///
87 /// Since we also store `sizes`, this `offsets` field is allowed to be stored out-of-order
88 /// (which is different from [`ListArray`](crate::arrays::ListArray)),
89 offsets: ArrayRef,
90
91 /// The `sizes` array indicating the length of each list.
92 ///
93 /// This field is intended to be paired with a corresponding offset to determine the list scalar
94 /// we want to access.
95 sizes: ArrayRef,
96
97 // TODO(connor)[ListView]: Add the n+1 memory allocation optimization.
98 /// A flag denoting if the array is zero-copyable* to a [`ListArray`](crate::arrays::ListArray).
99 ///
100 /// We use this information to help us more efficiently rebuild / compact our data.
101 ///
102 /// When this flag is true (indicating sorted offsets with no gaps and no overlaps), conversions
103 /// can bypass the very expensive rebuild process (which just calls `append_scalar` in a loop).
104 is_zero_copy_to_list: bool,
105
106 /// The validity / null map of the array.
107 ///
108 /// Note that this null map refers to which list scalars are null, **not** which sub-elements of
109 /// list scalars are null. The `elements` array will track individual value nullability.
110 pub(super) validity: Validity,
111
112 /// The stats for this array.
113 pub(super) stats_set: ArrayStats,
114}
115
116impl ListViewArray {
117 /// Creates a new [`ListViewArray`].
118 ///
119 /// # Panics
120 ///
121 /// Panics if the provided components do not satisfy the invariants documented
122 /// in [`ListViewArray::new_unchecked`].
123 pub fn new(elements: ArrayRef, offsets: ArrayRef, sizes: ArrayRef, validity: Validity) -> Self {
124 Self::try_new(elements, offsets, sizes, validity)
125 .vortex_expect("`ListViewArray` construction failed")
126 }
127
128 /// Constructs a new `ListViewArray`.
129 ///
130 /// # Errors
131 ///
132 /// Returns an error if the provided components do not satisfy the invariants documented
133 /// in [`ListViewArray::new_unchecked`].
134 pub fn try_new(
135 elements: ArrayRef,
136 offsets: ArrayRef,
137 sizes: ArrayRef,
138 validity: Validity,
139 ) -> VortexResult<Self> {
140 Self::validate(&elements, &offsets, &sizes, &validity)?;
141
142 Ok(Self {
143 dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
144 elements,
145 offsets,
146 sizes,
147 validity,
148 is_zero_copy_to_list: false,
149 stats_set: Default::default(),
150 })
151 }
152
153 /// Creates a new [`ListViewArray`] without validation.
154 ///
155 /// This unsafe function does not check the validity of the data. Prefer calling [`new()`] or
156 /// [`try_new()`] over this function, as they will check the validity of the data.
157 ///
158 /// [`ListArray`]: crate::arrays::ListArray
159 /// [`new()`]: Self::new
160 /// [`try_new()`]: Self::try_new
161 ///
162 /// # Safety
163 ///
164 /// The caller must ensure all of the following invariants are satisfied:
165 ///
166 /// - `offsets` and `sizes` must be non-nullable integer arrays.
167 /// - `offsets` and `sizes` must have the same length.
168 /// - Size integer width must be smaller than or equal to offset type (to prevent overflow).
169 /// - For each `i`, `offsets[i] + sizes[i]` must not overflow and must be `<= elements.len()`
170 /// (even if the corresponding view is defined as null by the validity array).
171 /// - If validity is an array, its length must equal `offsets.len()`.
172 pub unsafe fn new_unchecked(
173 elements: ArrayRef,
174 offsets: ArrayRef,
175 sizes: ArrayRef,
176 validity: Validity,
177 ) -> Self {
178 if cfg!(debug_assertions) {
179 Self::validate(&elements, &offsets, &sizes, &validity)
180 .vortex_expect("Failed to crate `ListViewArray`");
181 }
182
183 Self {
184 dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
185 elements,
186 offsets,
187 sizes,
188 validity,
189 is_zero_copy_to_list: false,
190 stats_set: Default::default(),
191 }
192 }
193
194 /// Validates the components that would be used to create a [`ListViewArray`].
195 pub fn validate(
196 elements: &dyn Array,
197 offsets: &dyn Array,
198 sizes: &dyn Array,
199 validity: &Validity,
200 ) -> VortexResult<()> {
201 // Check that offsets and sizes are integer arrays and non-nullable.
202 vortex_ensure!(
203 offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
204 "offsets must be non-nullable integer array, got {}",
205 offsets.dtype()
206 );
207 vortex_ensure!(
208 sizes.dtype().is_int() && !sizes.dtype().is_nullable(),
209 "sizes must be non-nullable integer array, got {}",
210 sizes.dtype()
211 );
212
213 // Check that they have the same length.
214 vortex_ensure!(
215 offsets.len() == sizes.len(),
216 "offsets and sizes must have the same length, got {} and {}",
217 offsets.len(),
218 sizes.len()
219 );
220
221 // Check that the size type can fit within the offset type to prevent overflows.
222 let size_ptype = sizes.dtype().as_ptype();
223 let offset_ptype = offsets.dtype().as_ptype();
224
225 // If a validity array is present, it must be the same length as the `ListViewArray`.
226 if let Some(validity_len) = validity.maybe_len() {
227 vortex_ensure!(
228 validity_len == offsets.len(),
229 "validity with size {validity_len} does not match array size {}",
230 offsets.len()
231 );
232 }
233
234 let offsets_primitive = offsets.to_primitive();
235 let sizes_primitive = sizes.to_primitive();
236
237 // Validate the `offsets` and `sizes` arrays.
238 match_each_integer_ptype!(offset_ptype, |O| {
239 match_each_integer_ptype!(size_ptype, |S| {
240 let offsets_slice = offsets_primitive.as_slice::<O>();
241 let sizes_slice = sizes_primitive.as_slice::<S>();
242
243 validate_offsets_and_sizes::<O, S>(
244 offsets_slice,
245 sizes_slice,
246 elements.len() as u64,
247 )?;
248 })
249 });
250
251 Ok(())
252 }
253
254 /// Sets whether this [`ListViewArray`] is zero-copyable to a [`ListArray`].
255 ///
256 /// This is an optimization flag that enables more efficient conversion to [`ListArray`] without
257 /// needing to copy or reorganize the data.
258 ///
259 /// [`ListArray`]: crate::arrays::ListArray
260 ///
261 /// # Safety
262 ///
263 /// When setting `is_zctl` to `true`, the caller must ensure that the [`ListViewArray`] is
264 /// actually zero-copyable to a [`ListArray`]. This means:
265 ///
266 /// - Offsets must be sorted (but not strictly sorted, zero-length lists are allowed).
267 /// - No gaps in elements between first and last referenced elements.
268 /// - No overlapping list views (each element referenced at most once).
269 ///
270 /// Note that leading and trailing unreferenced elements **ARE** allowed.
271 pub unsafe fn with_zero_copy_to_list(mut self, is_zctl: bool) -> Self {
272 if cfg!(debug_assertions) && is_zctl {
273 validate_zctl(
274 &self.elements,
275 self.offsets.to_primitive(),
276 self.sizes.to_primitive(),
277 )
278 .vortex_expect("Failed to validate zero-copy to list flag");
279 }
280 self.is_zero_copy_to_list = is_zctl;
281 self
282 }
283
284 /// Verifies that the `ListViewArray` is zero-copyable to a [`ListArray`].
285 ///
286 /// This will run an expensive validation of the `ListViewArray`'s components. It will check the
287 /// following things:
288 ///
289 /// - Offsets must be sorted (but not strictly sorted, zero-length lists are allowed).
290 /// - No gaps in elements between first and last referenced elements.
291 /// - No overlapping list views (each element referenced at most once).
292 ///
293 /// Note that leading and trailing unreferenced elements **ARE** allowed.
294 ///
295 /// This method should really only be called if the caller knows that the `ListViewArray` will
296 /// be converted into a [`ListArray`] in the future, and the caller wants to set the
297 /// optimization flag to `true` with the unsafe [`with_zero_copy_to_list`] method.
298 ///
299 /// [`ListArray`]: crate::arrays::ListArray
300 /// [`with_zero_copy_to_list`]: Self::with_zero_copy_to_list
301 pub fn verify_is_zero_copy_to_list(&self) -> bool {
302 validate_zctl(
303 &self.elements,
304 self.offsets.to_primitive(),
305 self.sizes.to_primitive(),
306 )
307 .is_ok()
308 }
309
310 /// Returns the offset at the given index.
311 ///
312 /// Note that it is possible the corresponding list view is null (which is only defined by the
313 /// validity map). Regardless, we are still guaranteed that this offset is valid by the
314 /// invariants of [`ListViewArray`].
315 pub fn offset_at(&self, index: usize) -> usize {
316 assert!(
317 index < self.len(),
318 "Index {index} out of bounds 0..{}",
319 self.len()
320 );
321
322 // Fast path for `PrimitiveArray`.
323 self.offsets
324 .as_opt::<PrimitiveVTable>()
325 .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
326 .unwrap_or_else(|| {
327 // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
328 self.offsets
329 .scalar_at(index)
330 .as_primitive()
331 .as_::<usize>()
332 .vortex_expect("offset must fit in usize")
333 })
334 }
335
336 /// Returns the size at the given index.
337 ///
338 /// Note that it is possible the corresponding list view is null (which is only defined by the
339 /// validity map). Regardless, we are still guaranteed that this size is valid by the invariants
340 /// of [`ListViewArray`].
341 pub fn size_at(&self, index: usize) -> usize {
342 assert!(
343 index < self.len(),
344 "Index {} out of bounds 0..{}",
345 index,
346 self.len()
347 );
348
349 // Fast path for `PrimitiveArray`.
350 self.sizes
351 .as_opt::<PrimitiveVTable>()
352 .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
353 .unwrap_or_else(|| {
354 // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
355 self.sizes
356 .scalar_at(index)
357 .as_primitive()
358 .as_::<usize>()
359 .vortex_expect("size must fit in usize")
360 })
361 }
362
363 /// Returns the elements at the given index from the list array.
364 pub fn list_elements_at(&self, index: usize) -> ArrayRef {
365 let offset = self.offset_at(index);
366 let size = self.size_at(index);
367 self.elements().slice(offset..offset + size)
368 }
369
370 /// Returns the offsets array.
371 pub fn offsets(&self) -> &ArrayRef {
372 &self.offsets
373 }
374
375 /// Returns the sizes array.
376 pub fn sizes(&self) -> &ArrayRef {
377 &self.sizes
378 }
379
380 /// Returns the elements array.
381 pub fn elements(&self) -> &ArrayRef {
382 &self.elements
383 }
384
385 /// Returns true if the `ListViewArray` is zero-copyable to a
386 /// [`ListArray`](crate::arrays::ListArray).
387 pub fn is_zero_copy_to_list(&self) -> bool {
388 self.is_zero_copy_to_list
389 }
390}
391
392/// Helper function to validate `offsets` and `sizes` with specific types.
393fn validate_offsets_and_sizes<O, S>(
394 offsets_slice: &[O],
395 sizes_slice: &[S],
396 elements_len: u64,
397) -> VortexResult<()>
398where
399 O: IntegerPType,
400 S: IntegerPType,
401{
402 debug_assert_eq!(offsets_slice.len(), sizes_slice.len());
403
404 #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
405 for i in 0..offsets_slice.len() {
406 let offset = offsets_slice[i];
407 let size = sizes_slice[i];
408
409 vortex_ensure!(offset >= O::zero(), "cannot have negative offsets");
410 vortex_ensure!(size >= S::zero(), "cannot have negative size");
411
412 let offset_u64 = offset
413 .to_u64()
414 .ok_or_else(|| vortex_err!("offset[{i}] = {offset:?} cannot be converted to u64"))?;
415
416 let size_u64 = size
417 .to_u64()
418 .ok_or_else(|| vortex_err!("size[{i}] = {size:?} cannot be converted to u64"))?;
419
420 // Check for overflow when adding offset + size.
421 let end = offset_u64.checked_add(size_u64).ok_or_else(|| {
422 vortex_err!("offset[{i}] ({offset_u64}) + size[{i}] ({size_u64}) would overflow u64")
423 })?;
424
425 if offset_u64 == elements_len {
426 vortex_ensure!(
427 size_u64 == 0,
428 "views to the end of the elements array (length {elements_len}) must have size 0"
429 );
430 }
431
432 vortex_ensure!(
433 end <= elements_len,
434 "offset[{i}] + size[{i}] = {offset_u64} + {size_u64} = {end} \
435 exceeds elements length {elements_len}",
436 );
437 }
438
439 Ok(())
440}
441
442/// Helper function to validate if the [`ListViewArray`] components are actually zero-copyable to
443/// [`ListArray`](crate::arrays::ListArray).
444fn validate_zctl(
445 elements: &dyn Array,
446 offsets_primitive: PrimitiveArray,
447 sizes_primitive: PrimitiveArray,
448) -> VortexResult<()> {
449 // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed), even
450 // if there are null views.
451 if let Some(is_sorted) = offsets_primitive.statistics().compute_is_sorted() {
452 vortex_ensure!(is_sorted, "offsets must be sorted");
453 } else {
454 vortex_bail!("offsets must report is_sorted statistic");
455 }
456
457 // TODO(connor)[ListView]: Making this allocation is expensive, but the more efficient
458 // implementation would be even more complicated than this. We could use a bit buffer denoting
459 // if positions in `elements` are used, and then additionally store a separate flag that tells
460 // us if a position is used more than once.
461 let mut element_references = vec![0u8; elements.len()];
462
463 fn count_references<O: IntegerPType, S: IntegerPType>(
464 element_references: &mut [u8],
465 offsets_primitive: PrimitiveArray,
466 sizes_primitive: PrimitiveArray,
467 ) {
468 let offsets_slice = offsets_primitive.as_slice::<O>();
469 let sizes_slice = sizes_primitive.as_slice::<S>();
470
471 // Note that we ignore nulls here, as the "null" view metadata must still maintain the same
472 // invariants as non-null views, even for a `bool` information.
473 for i in 0..offsets_slice.len() {
474 let offset: usize = offsets_slice[i].as_();
475 let size: usize = sizes_slice[i].as_();
476 for j in offset..offset + size {
477 element_references[j] = element_references[j].saturating_add(1);
478 }
479 }
480 }
481
482 match_each_integer_ptype!(offsets_primitive.ptype(), |O| {
483 match_each_integer_ptype!(sizes_primitive.ptype(), |S| {
484 count_references::<O, S>(&mut element_references, offsets_primitive, sizes_primitive);
485 })
486 });
487
488 // Allow leading and trailing unreferenced elements, but not gaps in the middle.
489 let leftmost_used = element_references
490 .iter()
491 .position(|&references| references != 0);
492 let rightmost_used = element_references
493 .iter()
494 .rposition(|&references| references != 0);
495
496 if let (Some(first_ref), Some(last_ref)) = (leftmost_used, rightmost_used) {
497 vortex_ensure!(
498 element_references[first_ref..=last_ref]
499 .iter()
500 .all(|&references| references != 0),
501 "found gap in elements array between first and last referenced elements"
502 );
503 }
504
505 vortex_ensure!(element_references.iter().all(|&references| references <= 1));
506
507 Ok(())
508}