Skip to main content

vortex_array/arrays/listview/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use num_traits::AsPrimitive;
7use vortex_dtype::DType;
8use vortex_dtype::IntegerPType;
9use vortex_dtype::match_each_integer_ptype;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_bail;
13use vortex_error::vortex_ensure;
14use vortex_error::vortex_err;
15
16use crate::Array;
17use crate::ArrayRef;
18use crate::ToCanonical;
19use crate::arrays::PrimitiveArray;
20use crate::arrays::PrimitiveVTable;
21use crate::arrays::bool;
22use crate::stats::ArrayStats;
23use crate::validity::Validity;
24
25/// The canonical encoding for variable-length list arrays.
26///
27/// The `ListViewArray` encoding differs from [`ListArray`] in that it stores a child `sizes` array
28/// in addition to a child `offsets` array (which is the _only_ child in [`ListArray`]).
29///
30/// In the past, we used [`ListArray`] as the canonical encoding for [`DType::List`], but we have
31/// since migrated to `ListViewArray` for a few reasons:
32///
33/// - Enables better SIMD vectorization (no sequential dependency when reading `offsets`)
34/// - Allows out-of-order offsets for better compression (we can shuffle the buffers)
35/// - Supports different integer types for offsets vs sizes
36///
37/// It is worth mentioning that this encoding mirrors Apache Arrow's `ListView` array type, but does
38/// not exactly mirror the similar type found in DuckDB and Velox, which stores the pair of offset
39/// and size in a row-major fashion rather than column-major. More specifically, the row-major
40/// layout has a single child array with alternating offset and size next to each other.
41///
42/// We choose the column-major layout as it allows better compressability, as well as using
43/// different (logical) integer widths for our `offsets` and `sizes` buffers (note that the
44/// compressor will likely compress to a different bit-packed width, but this is speaking strictly
45/// about flexibility in the logcial type).
46///
47/// # Examples
48///
49/// ```
50/// # fn main() -> vortex_error::VortexResult<()> {
51/// # use vortex_array::arrays::{ListViewArray, PrimitiveArray};
52/// # use vortex_array::validity::Validity;
53/// # use vortex_array::IntoArray;
54/// # use vortex_buffer::buffer;
55/// # use std::sync::Arc;
56/// #
57/// // Create a list view array representing [[3, 4], [1], [2, 3]].
58/// // Note: Unlike `ListArray`, offsets don't need to be monotonic.
59///
60/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
61/// let offsets = buffer![2u32, 0, 1].into_array();  // Out-of-order offsets
62/// let sizes = buffer![2u32, 1, 2].into_array();  // The sizes cause overlaps
63///
64/// let list_view = ListViewArray::new(
65///     elements.into_array(),
66///     offsets.into_array(),
67///     sizes.into_array(),
68///     Validity::NonNullable,
69/// );
70///
71/// assert_eq!(list_view.len(), 3);
72///
73/// // Access individual lists
74/// let first_list = list_view.list_elements_at(0)?;
75/// assert_eq!(first_list.len(), 2);
76/// // First list contains elements[2..4] = [3, 4]
77///
78/// let first_offset = list_view.offset_at(0);
79/// let first_size = list_view.size_at(0);
80/// assert_eq!(first_offset, 2);
81/// assert_eq!(first_size, 2);
82/// # Ok(())
83/// # }
84/// ```
85///
86/// [`ListArray`]: crate::arrays::ListArray
87#[derive(Clone, Debug)]
88pub struct ListViewArray {
89    /// The [`DType`] of the list array.
90    ///
91    /// This type **must** be the variant [`DType::List`].
92    pub(super) dtype: DType,
93
94    /// The `elements` data array, where each list scalar is a _slice_ of the `elements` array, and
95    /// each inner list element is a _scalar_ of the `elements` array.
96    elements: ArrayRef,
97
98    /// The `offsets` array indicating the start position of each list in elements.
99    ///
100    /// Since we also store `sizes`, this `offsets` field is allowed to be stored out-of-order
101    /// (which is different from [`ListArray`](crate::arrays::ListArray)),
102    offsets: ArrayRef,
103
104    /// The `sizes` array indicating the length of each list.
105    ///
106    /// This field is intended to be paired with a corresponding offset to determine the list scalar
107    /// we want to access.
108    sizes: ArrayRef,
109
110    // TODO(connor)[ListView]: Add the n+1 memory allocation optimization.
111    /// A flag denoting if the array is zero-copyable* to a [`ListArray`](crate::arrays::ListArray).
112    ///
113    /// We use this information to help us more efficiently rebuild / compact our data.
114    ///
115    /// When this flag is true (indicating sorted offsets with no gaps and no overlaps and all
116    /// `offsets[i] + sizes[i]` are in order), conversions can bypass the very expensive rebuild
117    /// process which must rebuild the array from scratch.
118    is_zero_copy_to_list: bool,
119
120    /// The validity / null map of the array.
121    ///
122    /// Note that this null map refers to which list scalars are null, **not** which sub-elements of
123    /// list scalars are null. The `elements` array will track individual value nullability.
124    pub(super) validity: Validity,
125
126    /// The stats for this array.
127    pub(super) stats_set: ArrayStats,
128}
129
130pub struct ListViewArrayParts {
131    pub elements_dtype: Arc<DType>,
132
133    /// See `ListViewArray::elements`
134    pub elements: ArrayRef,
135
136    /// See `ListViewArray::offsets`
137    pub offsets: ArrayRef,
138
139    /// See `ListViewArray::sizes`
140    pub sizes: ArrayRef,
141
142    /// See `ListViewArray::validity`
143    pub validity: Validity,
144}
145
146impl ListViewArray {
147    /// Creates a new [`ListViewArray`].
148    ///
149    /// # Panics
150    ///
151    /// Panics if the provided components do not satisfy the invariants documented
152    /// in [`ListViewArray::new_unchecked`].
153    pub fn new(elements: ArrayRef, offsets: ArrayRef, sizes: ArrayRef, validity: Validity) -> Self {
154        Self::try_new(elements, offsets, sizes, validity)
155            .vortex_expect("`ListViewArray` construction failed")
156    }
157
158    /// Constructs a new `ListViewArray`.
159    ///
160    /// # Errors
161    ///
162    /// Returns an error if the provided components do not satisfy the invariants documented
163    /// in [`ListViewArray::new_unchecked`].
164    pub fn try_new(
165        elements: ArrayRef,
166        offsets: ArrayRef,
167        sizes: ArrayRef,
168        validity: Validity,
169    ) -> VortexResult<Self> {
170        Self::validate(&elements, &offsets, &sizes, &validity)?;
171
172        Ok(Self {
173            dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
174            elements,
175            offsets,
176            sizes,
177            validity,
178            is_zero_copy_to_list: false,
179            stats_set: Default::default(),
180        })
181    }
182
183    /// Creates a new [`ListViewArray`] without validation.
184    ///
185    /// This unsafe function does not check the validity of the data. Prefer calling [`new()`] or
186    /// [`try_new()`] over this function, as they will check the validity of the data.
187    ///
188    /// [`ListArray`]: crate::arrays::ListArray
189    /// [`new()`]: Self::new
190    /// [`try_new()`]: Self::try_new
191    ///
192    /// # Safety
193    ///
194    /// The caller must ensure all of the following invariants are satisfied:
195    ///
196    /// - `offsets` and `sizes` must be non-nullable integer arrays.
197    /// - `offsets` and `sizes` must have the same length.
198    /// - Size integer width must be smaller than or equal to offset type (to prevent overflow).
199    /// - For each `i`, `offsets[i] + sizes[i]` must not overflow and must be `<= elements.len()`
200    ///   (even if the corresponding view is defined as null by the validity array).
201    /// - If validity is an array, its length must equal `offsets.len()`.
202    pub unsafe fn new_unchecked(
203        elements: ArrayRef,
204        offsets: ArrayRef,
205        sizes: ArrayRef,
206        validity: Validity,
207    ) -> Self {
208        if cfg!(debug_assertions) {
209            Self::validate(&elements, &offsets, &sizes, &validity)
210                .vortex_expect("Failed to crate `ListViewArray`");
211        }
212
213        Self {
214            dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
215            elements,
216            offsets,
217            sizes,
218            validity,
219            is_zero_copy_to_list: false,
220            stats_set: Default::default(),
221        }
222    }
223
224    /// Validates the components that would be used to create a [`ListViewArray`].
225    pub fn validate(
226        elements: &dyn Array,
227        offsets: &dyn Array,
228        sizes: &dyn Array,
229        validity: &Validity,
230    ) -> VortexResult<()> {
231        // Check that offsets and sizes are integer arrays and non-nullable.
232        vortex_ensure!(
233            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
234            "offsets must be non-nullable integer array, got {}",
235            offsets.dtype()
236        );
237        vortex_ensure!(
238            sizes.dtype().is_int() && !sizes.dtype().is_nullable(),
239            "sizes must be non-nullable integer array, got {}",
240            sizes.dtype()
241        );
242
243        // Check that they have the same length.
244        vortex_ensure!(
245            offsets.len() == sizes.len(),
246            "offsets and sizes must have the same length, got {} and {}",
247            offsets.len(),
248            sizes.len()
249        );
250
251        // Check that the size type can fit within the offset type to prevent overflows.
252        let size_ptype = sizes.dtype().as_ptype();
253        let offset_ptype = offsets.dtype().as_ptype();
254
255        // If a validity array is present, it must be the same length as the `ListViewArray`.
256        if let Some(validity_len) = validity.maybe_len() {
257            vortex_ensure!(
258                validity_len == offsets.len(),
259                "validity with size {validity_len} does not match array size {}",
260                offsets.len()
261            );
262        }
263
264        // Skip host-only validation when offsets/sizes are not host-resident.
265        if offsets.is_host() && sizes.is_host() {
266            let offsets_primitive = offsets.to_primitive();
267            let sizes_primitive = sizes.to_primitive();
268
269            // Validate the `offsets` and `sizes` arrays.
270            match_each_integer_ptype!(offset_ptype, |O| {
271                match_each_integer_ptype!(size_ptype, |S| {
272                    let offsets_slice = offsets_primitive.as_slice::<O>();
273                    let sizes_slice = sizes_primitive.as_slice::<S>();
274
275                    validate_offsets_and_sizes::<O, S>(
276                        offsets_slice,
277                        sizes_slice,
278                        elements.len() as u64,
279                    )?;
280                })
281            });
282        }
283
284        Ok(())
285    }
286
287    /// Sets whether this [`ListViewArray`] is zero-copyable to a [`ListArray`].
288    ///
289    /// This is an optimization flag that enables more efficient conversion to [`ListArray`] without
290    /// needing to copy or reorganize the data.
291    ///
292    /// [`ListArray`]: crate::arrays::ListArray
293    ///
294    /// # Safety
295    ///
296    /// When setting `is_zctl` to `true`, the caller must ensure that the [`ListViewArray`] is
297    /// actually zero-copyable to a [`ListArray`]. This means:
298    ///
299    /// - Offsets must be sorted (but not strictly sorted, zero-length lists are allowed).
300    /// - `offsets[i] + sizes[i] == offsets[i + 1]` for all `i`.
301    /// - No gaps in elements between first and last referenced elements.
302    /// - No overlapping list views (each element referenced at most once).
303    ///
304    /// Note that leading and trailing unreferenced elements **ARE** allowed.
305    pub unsafe fn with_zero_copy_to_list(mut self, is_zctl: bool) -> Self {
306        if cfg!(debug_assertions) && is_zctl {
307            validate_zctl(
308                &self.elements,
309                self.offsets.to_primitive(),
310                self.sizes.to_primitive(),
311            )
312            .vortex_expect("Failed to validate zero-copy to list flag");
313        }
314        self.is_zero_copy_to_list = is_zctl;
315        self
316    }
317
318    /// Verifies that the `ListViewArray` is zero-copyable to a [`ListArray`].
319    ///
320    /// This will run an expensive validation of the `ListViewArray`'s components. It will check the
321    /// following things:
322    ///
323    /// - Offsets must be sorted (but not strictly sorted, zero-length lists are allowed).
324    /// - No gaps in elements between first and last referenced elements.
325    /// - No overlapping list views (each element referenced at most once).
326    ///
327    /// Note that leading and trailing unreferenced elements **ARE** allowed.
328    ///
329    /// This method should really only be called if the caller knows that the `ListViewArray` will
330    /// be converted into a [`ListArray`] in the future, and the caller wants to set the
331    /// optimization flag to `true` with the unsafe [`with_zero_copy_to_list`] method.
332    ///
333    /// [`ListArray`]: crate::arrays::ListArray
334    /// [`with_zero_copy_to_list`]: Self::with_zero_copy_to_list
335    pub fn verify_is_zero_copy_to_list(&self) -> bool {
336        validate_zctl(
337            &self.elements,
338            self.offsets.to_primitive(),
339            self.sizes.to_primitive(),
340        )
341        .is_ok()
342    }
343
344    pub fn into_parts(self) -> ListViewArrayParts {
345        let dtype = self.dtype.into_list_element_opt().vortex_expect("is list");
346        ListViewArrayParts {
347            elements_dtype: dtype,
348            elements: self.elements,
349            offsets: self.offsets,
350            sizes: self.sizes,
351            validity: self.validity,
352        }
353    }
354
355    /// Returns the offset at the given index.
356    ///
357    /// Note that it is possible the corresponding list view is null (which is only defined by the
358    /// validity map). Regardless, we are still guaranteed that this offset is valid by the
359    /// invariants of [`ListViewArray`].
360    pub fn offset_at(&self, index: usize) -> usize {
361        assert!(
362            index < self.len(),
363            "Index {index} out of bounds 0..{}",
364            self.len()
365        );
366
367        // Fast path for `PrimitiveArray`.
368        self.offsets
369            .as_opt::<PrimitiveVTable>()
370            .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
371            .unwrap_or_else(|| {
372                // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
373                self.offsets
374                    .scalar_at(index)
375                    .vortex_expect("offsets must support scalar_at")
376                    .as_primitive()
377                    .as_::<usize>()
378                    .vortex_expect("offset must fit in usize")
379            })
380    }
381
382    /// Returns the size at the given index.
383    ///
384    /// Note that it is possible the corresponding list view is null (which is only defined by the
385    /// validity map). Regardless, we are still guaranteed that this size is valid by the invariants
386    /// of [`ListViewArray`].
387    pub fn size_at(&self, index: usize) -> usize {
388        assert!(
389            index < self.len(),
390            "Index {} out of bounds 0..{}",
391            index,
392            self.len()
393        );
394
395        // Fast path for `PrimitiveArray`.
396        self.sizes
397            .as_opt::<PrimitiveVTable>()
398            .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
399            .unwrap_or_else(|| {
400                // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
401                self.sizes
402                    .scalar_at(index)
403                    .vortex_expect("sizes must support scalar_at")
404                    .as_primitive()
405                    .as_::<usize>()
406                    .vortex_expect("size must fit in usize")
407            })
408    }
409
410    /// Returns the elements at the given index from the list array.
411    ///
412    /// # Errors
413    ///
414    /// Returns an error if the slice operation fails.
415    pub fn list_elements_at(&self, index: usize) -> VortexResult<ArrayRef> {
416        let offset = self.offset_at(index);
417        let size = self.size_at(index);
418        self.elements().slice(offset..offset + size)
419    }
420
421    /// Returns the offsets array.
422    pub fn offsets(&self) -> &ArrayRef {
423        &self.offsets
424    }
425
426    /// Returns the sizes array.
427    pub fn sizes(&self) -> &ArrayRef {
428        &self.sizes
429    }
430
431    /// Returns the elements array.
432    pub fn elements(&self) -> &ArrayRef {
433        &self.elements
434    }
435
436    /// Returns true if the `ListViewArray` is zero-copyable to a
437    /// [`ListArray`](crate::arrays::ListArray).
438    pub fn is_zero_copy_to_list(&self) -> bool {
439        self.is_zero_copy_to_list
440    }
441}
442
443/// Helper function to validate `offsets` and `sizes` with specific types.
444fn validate_offsets_and_sizes<O, S>(
445    offsets_slice: &[O],
446    sizes_slice: &[S],
447    elements_len: u64,
448) -> VortexResult<()>
449where
450    O: IntegerPType,
451    S: IntegerPType,
452{
453    debug_assert_eq!(offsets_slice.len(), sizes_slice.len());
454
455    #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
456    for i in 0..offsets_slice.len() {
457        let offset = offsets_slice[i];
458        let size = sizes_slice[i];
459
460        vortex_ensure!(offset >= O::zero(), "cannot have negative offsets");
461        vortex_ensure!(size >= S::zero(), "cannot have negative size");
462
463        let offset_u64 = offset
464            .to_u64()
465            .ok_or_else(|| vortex_err!("offset[{i}] = {offset:?} cannot be converted to u64"))?;
466
467        let size_u64 = size
468            .to_u64()
469            .ok_or_else(|| vortex_err!("size[{i}] = {size:?} cannot be converted to u64"))?;
470
471        // Check for overflow when adding offset + size.
472        let end = offset_u64.checked_add(size_u64).ok_or_else(|| {
473            vortex_err!("offset[{i}] ({offset_u64}) + size[{i}] ({size_u64}) would overflow u64")
474        })?;
475
476        if offset_u64 == elements_len {
477            vortex_ensure!(
478                size_u64 == 0,
479                "views to the end of the elements array (length {elements_len}) must have size 0 \
480                    (had size {size_u64})"
481            );
482        }
483
484        vortex_ensure!(
485            end <= elements_len,
486            "offset[{i}] + size[{i}] = {offset_u64} + {size_u64} = {end} \
487            exceeds elements length {elements_len}",
488        );
489    }
490
491    Ok(())
492}
493
494/// Helper function to validate if the [`ListViewArray`] components are actually zero-copyable to
495/// [`ListArray`](crate::arrays::ListArray).
496fn validate_zctl(
497    elements: &dyn Array,
498    offsets_primitive: PrimitiveArray,
499    sizes_primitive: PrimitiveArray,
500) -> VortexResult<()> {
501    // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed), even
502    // if there are null views.
503    if let Some(is_sorted) = offsets_primitive.statistics().compute_is_sorted() {
504        vortex_ensure!(is_sorted, "offsets must be sorted");
505    } else {
506        vortex_bail!("offsets must report is_sorted statistic");
507    }
508
509    // Validate that offset[i] + size[i] <= offset[i+1] for all items
510    // This ensures views are non-overlapping and properly ordered for zero-copy-to-list
511    fn validate_monotonic_ends<O: IntegerPType, S: IntegerPType>(
512        offsets_slice: &[O],
513        sizes_slice: &[S],
514        len: usize,
515    ) -> VortexResult<()> {
516        let mut max_end = 0usize;
517
518        for i in 0..len {
519            let offset = offsets_slice[i].to_usize().unwrap_or(usize::MAX);
520            let size = sizes_slice[i].to_usize().unwrap_or(usize::MAX);
521
522            // Check that this view starts at or after the previous view ended
523            vortex_ensure!(
524                offset >= max_end,
525                "Zero-copy-to-list requires views to be non-overlapping and ordered: \
526                 view[{}] starts at {} but previous views extend to {}",
527                i,
528                offset,
529                max_end
530            );
531
532            // Update max_end for the next iteration
533            let end = offset.saturating_add(size);
534            max_end = max_end.max(end);
535        }
536
537        Ok(())
538    }
539
540    let offsets_dtype = offsets_primitive.dtype();
541    let sizes_dtype = sizes_primitive.dtype();
542    let len = offsets_primitive.len();
543
544    // Check that offset + size values are monotonic (no overlaps)
545    match_each_integer_ptype!(offsets_dtype.as_ptype(), |O| {
546        match_each_integer_ptype!(sizes_dtype.as_ptype(), |S| {
547            let offsets_slice = offsets_primitive.as_slice::<O>();
548            let sizes_slice = sizes_primitive.as_slice::<S>();
549
550            validate_monotonic_ends(offsets_slice, sizes_slice, len)?;
551        })
552    });
553
554    // TODO(connor)[ListView]: Making this allocation is expensive, but the more efficient
555    // implementation would be even more complicated than this. We could use a bit buffer denoting
556    // if positions in `elements` are used, and then additionally store a separate flag that tells
557    // us if a position is used more than once.
558    let mut element_references = vec![0u8; elements.len()];
559
560    fn count_references<O: IntegerPType, S: IntegerPType>(
561        element_references: &mut [u8],
562        offsets_primitive: PrimitiveArray,
563        sizes_primitive: PrimitiveArray,
564    ) {
565        let offsets_slice = offsets_primitive.as_slice::<O>();
566        let sizes_slice = sizes_primitive.as_slice::<S>();
567
568        // Note that we ignore nulls here, as the "null" view metadata must still maintain the same
569        // invariants as non-null views, even for a `bool` information.
570        for i in 0..offsets_slice.len() {
571            let offset: usize = offsets_slice[i].as_();
572            let size: usize = sizes_slice[i].as_();
573            for j in offset..offset + size {
574                element_references[j] = element_references[j].saturating_add(1);
575            }
576        }
577    }
578
579    match_each_integer_ptype!(offsets_primitive.ptype(), |O| {
580        match_each_integer_ptype!(sizes_primitive.ptype(), |S| {
581            count_references::<O, S>(&mut element_references, offsets_primitive, sizes_primitive);
582        })
583    });
584
585    // Allow leading and trailing unreferenced elements, but not gaps in the middle.
586    let leftmost_used = element_references
587        .iter()
588        .position(|&references| references != 0);
589    let rightmost_used = element_references
590        .iter()
591        .rposition(|&references| references != 0);
592
593    if let (Some(first_ref), Some(last_ref)) = (leftmost_used, rightmost_used) {
594        vortex_ensure!(
595            element_references[first_ref..=last_ref]
596                .iter()
597                .all(|&references| references != 0),
598            "found gap in elements array between first and last referenced elements"
599        );
600    }
601
602    vortex_ensure!(element_references.iter().all(|&references| references <= 1));
603
604    Ok(())
605}