vortex_array/arrays/listview/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use num_traits::AsPrimitive;
7use vortex_dtype::{DType, IntegerPType, match_each_integer_ptype};
8use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_ensure, vortex_err};
9
10use crate::arrays::{PrimitiveArray, PrimitiveVTable, bool};
11use crate::stats::ArrayStats;
12use crate::validity::Validity;
13use crate::{Array, ArrayRef, ToCanonical};
14
15/// The canonical encoding for variable-length list arrays.
16///
17/// The `ListViewArray` encoding differs from [`ListArray`] in that it stores a child `sizes` array
18/// in addition to a child `offsets` array (which is the _only_ child in [`ListArray`]).
19///
20/// In the past, we used [`ListArray`] as the canonical encoding for [`DType::List`], but we have
21/// since migrated to `ListViewArray` for a few reasons:
22///
23/// - Enables better SIMD vectorization (no sequential dependency when reading `offsets`)
24/// - Allows out-of-order offsets for better compression (we can shuffle the buffers)
25/// - Supports different integer types for offsets vs sizes
26///
27/// It is worth mentioning that this encoding mirrors Apache Arrow's `ListView` array type, but does
28/// not exactly mirror the similar type found in DuckDB and Velox, which stores the pair of offset
29/// and size in a row-major fashion rather than column-major. More specifically, the row-major
30/// layout has a single child array with alternating offset and size next to each other.
31///
32/// We choose the column-major layout as it allows better compressability, as well as using
33/// different (logical) integer widths for our `offsets` and `sizes` buffers (note that the
34/// compressor will likely compress to a different bit-packed width, but this is speaking strictly
35/// about flexibility in the logcial type).
36///
37/// # Examples
38///
39/// ```
40/// # use vortex_array::arrays::{ListViewArray, PrimitiveArray};
41/// # use vortex_array::validity::Validity;
42/// # use vortex_array::IntoArray;
43/// # use vortex_buffer::buffer;
44/// # use std::sync::Arc;
45/// #
46/// // Create a list view array representing [[3, 4], [1], [2, 3]].
47/// // Note: Unlike `ListArray`, offsets don't need to be monotonic.
48///
49/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
50/// let offsets = buffer![2u32, 0, 1].into_array();  // Out-of-order offsets
51/// let sizes = buffer![2u32, 1, 2].into_array();  // The sizes cause overlaps
52///
53/// let list_view = ListViewArray::new(
54///     elements.into_array(),
55///     offsets.into_array(),
56///     sizes.into_array(),
57///     Validity::NonNullable,
58/// );
59///
60/// assert_eq!(list_view.len(), 3);
61///
62/// // Access individual lists
63/// let first_list = list_view.list_elements_at(0);
64/// assert_eq!(first_list.len(), 2);
65/// // First list contains elements[2..4] = [3, 4]
66///
67/// let first_offset = list_view.offset_at(0);
68/// let first_size = list_view.size_at(0);
69/// assert_eq!(first_offset, 2);
70/// assert_eq!(first_size, 2);
71/// ```
72///
73/// [`ListArray`]: crate::arrays::ListArray
74#[derive(Clone, Debug)]
75pub struct ListViewArray {
76    /// The [`DType`] of the list array.
77    ///
78    /// This type **must** be the variant [`DType::List`].
79    pub(super) dtype: DType,
80
81    /// The `elements` data array, where each list scalar is a _slice_ of the `elements` array, and
82    /// each inner list element is a _scalar_ of the `elements` array.
83    elements: ArrayRef,
84
85    /// The `offsets` array indicating the start position of each list in elements.
86    ///
87    /// Since we also store `sizes`, this `offsets` field is allowed to be stored out-of-order
88    /// (which is different from [`ListArray`](crate::arrays::ListArray)),
89    offsets: ArrayRef,
90
91    /// The `sizes` array indicating the length of each list.
92    ///
93    /// This field is intended to be paired with a corresponding offset to determine the list scalar
94    /// we want to access.
95    sizes: ArrayRef,
96
97    // TODO(connor)[ListView]: Add the n+1 memory allocation optimization.
98    /// A flag denoting if the array is zero-copyable* to a [`ListArray`](crate::arrays::ListArray).
99    ///
100    /// We use this information to help us more efficiently rebuild / compact our data.
101    ///
102    /// When this flag is true (indicating sorted offsets with no gaps and no overlaps), conversions
103    /// can bypass the very expensive rebuild process (which just calls `append_scalar` in a loop).
104    is_zero_copy_to_list: bool,
105
106    /// The validity / null map of the array.
107    ///
108    /// Note that this null map refers to which list scalars are null, **not** which sub-elements of
109    /// list scalars are null. The `elements` array will track individual value nullability.
110    pub(super) validity: Validity,
111
112    /// The stats for this array.
113    pub(super) stats_set: ArrayStats,
114}
115
116impl ListViewArray {
117    /// Creates a new [`ListViewArray`].
118    ///
119    /// # Panics
120    ///
121    /// Panics if the provided components do not satisfy the invariants documented
122    /// in [`ListViewArray::new_unchecked`].
123    pub fn new(elements: ArrayRef, offsets: ArrayRef, sizes: ArrayRef, validity: Validity) -> Self {
124        Self::try_new(elements, offsets, sizes, validity)
125            .vortex_expect("`ListViewArray` construction failed")
126    }
127
128    /// Constructs a new `ListViewArray`.
129    ///
130    /// # Errors
131    ///
132    /// Returns an error if the provided components do not satisfy the invariants documented
133    /// in [`ListViewArray::new_unchecked`].
134    pub fn try_new(
135        elements: ArrayRef,
136        offsets: ArrayRef,
137        sizes: ArrayRef,
138        validity: Validity,
139    ) -> VortexResult<Self> {
140        Self::validate(&elements, &offsets, &sizes, &validity)?;
141
142        Ok(Self {
143            dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
144            elements,
145            offsets,
146            sizes,
147            validity,
148            is_zero_copy_to_list: false,
149            stats_set: Default::default(),
150        })
151    }
152
153    /// Creates a new [`ListViewArray`] without validation.
154    ///
155    /// This unsafe function does not check the validity of the data. Prefer calling [`new()`] or
156    /// [`try_new()`] over this function, as they will check the validity of the data.
157    ///
158    /// [`ListArray`]: crate::arrays::ListArray
159    /// [`new()`]: Self::new
160    /// [`try_new()`]: Self::try_new
161    ///
162    /// # Safety
163    ///
164    /// The caller must ensure all of the following invariants are satisfied:
165    ///
166    /// - `offsets` and `sizes` must be non-nullable integer arrays.
167    /// - `offsets` and `sizes` must have the same length.
168    /// - Size integer width must be smaller than or equal to offset type (to prevent overflow).
169    /// - For each `i`, `offsets[i] + sizes[i]` must not overflow and must be `<= elements.len()`
170    ///   (even if the corresponding view is defined as null by the validity array).
171    /// - If validity is an array, its length must equal `offsets.len()`.
172    pub unsafe fn new_unchecked(
173        elements: ArrayRef,
174        offsets: ArrayRef,
175        sizes: ArrayRef,
176        validity: Validity,
177    ) -> Self {
178        if cfg!(debug_assertions) {
179            Self::validate(&elements, &offsets, &sizes, &validity)
180                .vortex_expect("Failed to crate `ListViewArray`");
181        }
182
183        Self {
184            dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
185            elements,
186            offsets,
187            sizes,
188            validity,
189            is_zero_copy_to_list: false,
190            stats_set: Default::default(),
191        }
192    }
193
194    /// Validates the components that would be used to create a [`ListViewArray`].
195    pub fn validate(
196        elements: &dyn Array,
197        offsets: &dyn Array,
198        sizes: &dyn Array,
199        validity: &Validity,
200    ) -> VortexResult<()> {
201        // Check that offsets and sizes are integer arrays and non-nullable.
202        vortex_ensure!(
203            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
204            "offsets must be non-nullable integer array, got {}",
205            offsets.dtype()
206        );
207        vortex_ensure!(
208            sizes.dtype().is_int() && !sizes.dtype().is_nullable(),
209            "sizes must be non-nullable integer array, got {}",
210            sizes.dtype()
211        );
212
213        // Check that they have the same length.
214        vortex_ensure!(
215            offsets.len() == sizes.len(),
216            "offsets and sizes must have the same length, got {} and {}",
217            offsets.len(),
218            sizes.len()
219        );
220
221        // Check that the size type can fit within the offset type to prevent overflows.
222        let size_ptype = sizes.dtype().as_ptype();
223        let offset_ptype = offsets.dtype().as_ptype();
224
225        // If a validity array is present, it must be the same length as the `ListViewArray`.
226        if let Some(validity_len) = validity.maybe_len() {
227            vortex_ensure!(
228                validity_len == offsets.len(),
229                "validity with size {validity_len} does not match array size {}",
230                offsets.len()
231            );
232        }
233
234        let offsets_primitive = offsets.to_primitive();
235        let sizes_primitive = sizes.to_primitive();
236
237        // Validate the `offsets` and `sizes` arrays.
238        match_each_integer_ptype!(offset_ptype, |O| {
239            match_each_integer_ptype!(size_ptype, |S| {
240                let offsets_slice = offsets_primitive.as_slice::<O>();
241                let sizes_slice = sizes_primitive.as_slice::<S>();
242
243                validate_offsets_and_sizes::<O, S>(
244                    offsets_slice,
245                    sizes_slice,
246                    elements.len() as u64,
247                )?;
248            })
249        });
250
251        Ok(())
252    }
253
254    /// Sets whether this [`ListViewArray`] is zero-copyable to a [`ListArray`].
255    ///
256    /// This is an optimization flag that enables more efficient conversion to [`ListArray`] without
257    /// needing to copy or reorganize the data.
258    ///
259    /// [`ListArray`]: crate::arrays::ListArray
260    ///
261    /// # Safety
262    ///
263    /// When setting `is_zctl` to `true`, the caller must ensure that the [`ListViewArray`] is
264    /// actually zero-copyable to a [`ListArray`]. This means:
265    ///
266    /// - Offsets must be sorted (but not strictly sorted, zero-length lists are allowed).
267    /// - No gaps in elements between first and last referenced elements.
268    /// - No overlapping list views (each element referenced at most once).
269    ///
270    /// Note that leading and trailing unreferenced elements **ARE** allowed.
271    pub unsafe fn with_zero_copy_to_list(mut self, is_zctl: bool) -> Self {
272        if cfg!(debug_assertions) && is_zctl {
273            validate_zctl(
274                &self.elements,
275                self.offsets.to_primitive(),
276                self.sizes.to_primitive(),
277            )
278            .vortex_expect("Failed to validate zero-copy to list flag");
279        }
280        self.is_zero_copy_to_list = is_zctl;
281        self
282    }
283
284    /// Verifies that the `ListViewArray` is zero-copyable to a [`ListArray`].
285    ///
286    /// This will run an expensive validation of the `ListViewArray`'s components. It will check the
287    /// following things:
288    ///
289    /// - Offsets must be sorted (but not strictly sorted, zero-length lists are allowed).
290    /// - No gaps in elements between first and last referenced elements.
291    /// - No overlapping list views (each element referenced at most once).
292    ///
293    /// Note that leading and trailing unreferenced elements **ARE** allowed.
294    ///
295    /// This method should really only be called if the caller knows that the `ListViewArray` will
296    /// be converted into a [`ListArray`] in the future, and the caller wants to set the
297    /// optimization flag to `true` with the unsafe [`with_zero_copy_to_list`] method.
298    ///
299    /// [`ListArray`]: crate::arrays::ListArray
300    /// [`with_zero_copy_to_list`]: Self::with_zero_copy_to_list
301    pub fn verify_is_zero_copy_to_list(&self) -> bool {
302        validate_zctl(
303            &self.elements,
304            self.offsets.to_primitive(),
305            self.sizes.to_primitive(),
306        )
307        .is_ok()
308    }
309
310    /// Returns the offset at the given index.
311    ///
312    /// Note that it is possible the corresponding list view is null (which is only defined by the
313    /// validity map). Regardless, we are still guaranteed that this offset is valid by the
314    /// invariants of [`ListViewArray`].
315    pub fn offset_at(&self, index: usize) -> usize {
316        assert!(
317            index < self.len(),
318            "Index {index} out of bounds 0..{}",
319            self.len()
320        );
321
322        // Fast path for `PrimitiveArray`.
323        self.offsets
324            .as_opt::<PrimitiveVTable>()
325            .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
326            .unwrap_or_else(|| {
327                // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
328                self.offsets
329                    .scalar_at(index)
330                    .as_primitive()
331                    .as_::<usize>()
332                    .vortex_expect("offset must fit in usize")
333            })
334    }
335
336    /// Returns the size at the given index.
337    ///
338    /// Note that it is possible the corresponding list view is null (which is only defined by the
339    /// validity map). Regardless, we are still guaranteed that this size is valid by the invariants
340    /// of [`ListViewArray`].
341    pub fn size_at(&self, index: usize) -> usize {
342        assert!(
343            index < self.len(),
344            "Index {} out of bounds 0..{}",
345            index,
346            self.len()
347        );
348
349        // Fast path for `PrimitiveArray`.
350        self.sizes
351            .as_opt::<PrimitiveVTable>()
352            .map(|p| match_each_integer_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
353            .unwrap_or_else(|| {
354                // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
355                self.sizes
356                    .scalar_at(index)
357                    .as_primitive()
358                    .as_::<usize>()
359                    .vortex_expect("size must fit in usize")
360            })
361    }
362
363    /// Returns the elements at the given index from the list array.
364    pub fn list_elements_at(&self, index: usize) -> ArrayRef {
365        let offset = self.offset_at(index);
366        let size = self.size_at(index);
367        self.elements().slice(offset..offset + size)
368    }
369
370    /// Returns the offsets array.
371    pub fn offsets(&self) -> &ArrayRef {
372        &self.offsets
373    }
374
375    /// Returns the sizes array.
376    pub fn sizes(&self) -> &ArrayRef {
377        &self.sizes
378    }
379
380    /// Returns the elements array.
381    pub fn elements(&self) -> &ArrayRef {
382        &self.elements
383    }
384
385    /// Returns true if the `ListViewArray` is zero-copyable to a
386    /// [`ListArray`](crate::arrays::ListArray).
387    pub fn is_zero_copy_to_list(&self) -> bool {
388        self.is_zero_copy_to_list
389    }
390}
391
392/// Helper function to validate `offsets` and `sizes` with specific types.
393fn validate_offsets_and_sizes<O, S>(
394    offsets_slice: &[O],
395    sizes_slice: &[S],
396    elements_len: u64,
397) -> VortexResult<()>
398where
399    O: IntegerPType,
400    S: IntegerPType,
401{
402    debug_assert_eq!(offsets_slice.len(), sizes_slice.len());
403
404    #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
405    for i in 0..offsets_slice.len() {
406        let offset = offsets_slice[i];
407        let size = sizes_slice[i];
408
409        vortex_ensure!(offset >= O::zero(), "cannot have negative offsets");
410        vortex_ensure!(size >= S::zero(), "cannot have negative size");
411
412        let offset_u64 = offset
413            .to_u64()
414            .ok_or_else(|| vortex_err!("offset[{i}] = {offset:?} cannot be converted to u64"))?;
415
416        let size_u64 = size
417            .to_u64()
418            .ok_or_else(|| vortex_err!("size[{i}] = {size:?} cannot be converted to u64"))?;
419
420        // Check for overflow when adding offset + size.
421        let end = offset_u64.checked_add(size_u64).ok_or_else(|| {
422            vortex_err!("offset[{i}] ({offset_u64}) + size[{i}] ({size_u64}) would overflow u64")
423        })?;
424
425        if offset_u64 == elements_len {
426            vortex_ensure!(
427                size_u64 == 0,
428                "views to the end of the elements array (length {elements_len}) must have size 0"
429            );
430        }
431
432        vortex_ensure!(
433            end <= elements_len,
434            "offset[{i}] + size[{i}] = {offset_u64} + {size_u64} = {end} \
435            exceeds elements length {elements_len}",
436        );
437    }
438
439    Ok(())
440}
441
442/// Helper function to validate if the [`ListViewArray`] components are actually zero-copyable to
443/// [`ListArray`](crate::arrays::ListArray).
444fn validate_zctl(
445    elements: &dyn Array,
446    offsets_primitive: PrimitiveArray,
447    sizes_primitive: PrimitiveArray,
448) -> VortexResult<()> {
449    // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed), even
450    // if there are null views.
451    if let Some(is_sorted) = offsets_primitive.statistics().compute_is_sorted() {
452        vortex_ensure!(is_sorted, "offsets must be sorted");
453    } else {
454        vortex_bail!("offsets must report is_sorted statistic");
455    }
456
457    // TODO(connor)[ListView]: Making this allocation is expensive, but the more efficient
458    // implementation would be even more complicated than this. We could use a bit buffer denoting
459    // if positions in `elements` are used, and then additionally store a separate flag that tells
460    // us if a position is used more than once.
461    let mut element_references = vec![0u8; elements.len()];
462
463    fn count_references<O: IntegerPType, S: IntegerPType>(
464        element_references: &mut [u8],
465        offsets_primitive: PrimitiveArray,
466        sizes_primitive: PrimitiveArray,
467    ) {
468        let offsets_slice = offsets_primitive.as_slice::<O>();
469        let sizes_slice = sizes_primitive.as_slice::<S>();
470
471        // Note that we ignore nulls here, as the "null" view metadata must still maintain the same
472        // invariants as non-null views, even for a `bool` information.
473        for i in 0..offsets_slice.len() {
474            let offset: usize = offsets_slice[i].as_();
475            let size: usize = sizes_slice[i].as_();
476            for j in offset..offset + size {
477                element_references[j] = element_references[j].saturating_add(1);
478            }
479        }
480    }
481
482    match_each_integer_ptype!(offsets_primitive.ptype(), |O| {
483        match_each_integer_ptype!(sizes_primitive.ptype(), |S| {
484            count_references::<O, S>(&mut element_references, offsets_primitive, sizes_primitive);
485        })
486    });
487
488    // Allow leading and trailing unreferenced elements, but not gaps in the middle.
489    let leftmost_used = element_references
490        .iter()
491        .position(|&references| references != 0);
492    let rightmost_used = element_references
493        .iter()
494        .rposition(|&references| references != 0);
495
496    if let (Some(first_ref), Some(last_ref)) = (leftmost_used, rightmost_used) {
497        vortex_ensure!(
498            element_references[first_ref..=last_ref]
499                .iter()
500                .all(|&references| references != 0),
501            "found gap in elements array between first and last referenced elements"
502        );
503    }
504
505    vortex_ensure!(element_references.iter().all(|&references| references <= 1));
506
507    Ok(())
508}