vortex_array/arrays/listview/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use num_traits::{AsPrimitive, Zero};
7use vortex_dtype::{
8    DType, NativePType, Nullability, match_each_integer_ptype, match_each_native_ptype,
9};
10use vortex_error::{VortexExpect, VortexResult, vortex_ensure, vortex_err};
11
12use crate::arrays::{ListArray, PrimitiveVTable};
13use crate::builders::PrimitiveBuilder;
14use crate::stats::ArrayStats;
15use crate::validity::Validity;
16use crate::vtable::ValidityHelper;
17use crate::{Array, ArrayRef, Canonical, IntoArray, ToCanonical};
18
19mod vtable;
20pub use vtable::{ListViewEncoding, ListViewVTable};
21
22#[cfg(test)]
23mod tests;
24
25// TODO(connor)[ListView]: Add compute functions
26// mod compute;
27
28/// The canonical encoding for variable-length list arrays.
29///
30/// The `ListViewArray` encoding differs from [`ListArray`] in that it stores a child `sizes` array
31/// in addition to a child `offsets` array (which is the _only_ child in [`ListArray`]).
32///
33/// In the past, we used [`ListArray`] as the canonical encoding for [`DType::List`], but we have
34/// since migrated to `ListViewArray` for a few reasons:
35///
36/// - Enables better SIMD vectorization (no sequential dependency when reading `offsets`)
37/// - Allows out-of-order offsets for better compression (we can shuffle the buffers)
38/// - Supports different integer types for offsets vs sizes
39///
40/// It is worth mentioning that this encoding mirrors Apache Arrow's `ListView` array type, but does
41/// not exactly mirror the similar type found in DuckDB and Velox, which stores the pair of offset
42/// and size in a row-major fashion rather than column-major. More specifically, the row-major
43/// layout has a single child array with alternating offset and size next to each other.
44///
45/// We choose the column-major layout as it allows better compressability, as well as using
46/// different (logical) integer widths for our `offsets` and `sizes` buffers (note that the
47/// compressor will likely compress to a different bit-packed width, but this is speaking strictly
48/// about flexibility in the logcial type).
49///
50/// # Examples
51///
52/// ```
53/// use vortex_array::arrays::{ListViewArray, PrimitiveArray};
54/// use vortex_array::validity::Validity;
55/// use vortex_array::IntoArray;
56/// use vortex_buffer::buffer;
57/// use std::sync::Arc;
58///
59/// // Create a list view array representing [[3, 4], [1], [2, 5]]
60/// // Note: Unlike ListArray, offsets don't need to be monotonic
61/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
62/// let offsets = buffer![2u32, 0, 1].into_array();  // Out-of-order offsets
63/// let sizes = buffer![2u32, 1, 2].into_array();  // Corresponding sizes
64///
65/// let list_view = ListViewArray::try_new(
66///     elements.into_array(),
67///     offsets.into_array(),
68///     sizes.into_array(),
69///     Validity::NonNullable,
70/// ).unwrap();
71///
72/// assert_eq!(list_view.len(), 3);
73///
74/// // Access individual lists
75/// let first_list = list_view.list_elements_at(0);
76/// assert_eq!(first_list.len(), 2);
77/// // First list contains elements[2..4] = [3, 4]
78///
79/// let first_offset = list_view.offset_at(0);
80/// let first_size = list_view.size_at(0);
81/// assert_eq!(first_offset, 2);
82/// assert_eq!(first_size, 2);
83/// ```
84///
85/// [`ListArray`]: crate::arrays::ListArray
86#[derive(Clone, Debug)]
87pub struct ListViewArray {
88    /// The [`DType`] of the list array.
89    ///
90    /// This type **must** be the variant [`DType::List`].
91    dtype: DType,
92
93    /// The `elements` data array, where each list scalar is a _slice_ of the `elements` array, and
94    /// each inner list element is a _scalar_ of the `elements` array.
95    elements: ArrayRef,
96
97    /// The `offsets` array indicating the start position of each list in elements.
98    ///
99    /// Since we also store `sizes`, this `offsets` field is allowed to be stored out-of-order
100    /// (which is different from [`ListArray`](crate::arrays::ListArray)),
101    offsets: ArrayRef,
102
103    /// The `sizes` array indicating the length of each list.
104    ///
105    /// This field is intended to be paired with a corresponding offset to determine the list scalar
106    /// we want to access.
107    sizes: ArrayRef,
108
109    /// The validity / null map of the array.
110    ///
111    /// Note that this null map refers to which list scalars are null, **not** which sub-elements of
112    /// list scalars are null. The `elements` array will track individual value nullability.
113    validity: Validity,
114
115    /// The stats for this array.
116    stats_set: ArrayStats,
117}
118
119impl ListViewArray {
120    /// Get the length of the array.
121    pub fn len(&self) -> usize {
122        debug_assert_eq!(self.offsets.len(), self.sizes.len());
123        self.offsets.len()
124    }
125
126    /// Check if the array is empty.
127    pub fn is_empty(&self) -> bool {
128        self.len() == 0
129    }
130
131    /// Creates a new [`ListViewArray`].
132    ///
133    /// # Panics
134    ///
135    /// Panics if the provided components do not satisfy the invariants documented
136    /// in [`ListViewArray::new_unchecked`].
137    pub fn new(elements: ArrayRef, offsets: ArrayRef, sizes: ArrayRef, validity: Validity) -> Self {
138        Self::try_new(elements, offsets, sizes, validity)
139            .vortex_expect("ListViewArray construction failed")
140    }
141
142    /// Constructs a new `ListViewArray`.
143    ///
144    /// # Errors
145    ///
146    /// Returns an error if the provided components do not satisfy the invariants.
147    pub fn try_new(
148        elements: ArrayRef,
149        offsets: ArrayRef,
150        sizes: ArrayRef,
151        validity: Validity,
152    ) -> VortexResult<Self> {
153        Self::validate(&elements, &offsets, &sizes, &validity)?;
154
155        // SAFETY: validate ensures all invariants are met.
156        Ok(unsafe { Self::new_unchecked(elements, offsets, sizes, validity) })
157    }
158
159    /// Creates a new [`ListViewArray`] without validation.
160    ///
161    /// # Safety
162    ///
163    /// The caller must ensure all of the following invariants are satisfied:
164    ///
165    /// - `offsets` and `sizes` must be non-nullable integer arrays.
166    /// - `offsets` and `sizes` must have the same length.
167    /// - Size integer width must be smaller than or equal to offset type (to prevent overflow).
168    /// - For each `i`, `offsets[i] + sizes[i]` must not overflow and must be `<= elements.len()`.
169    /// - If validity is an array, its length must equal `offsets.len()`.
170    pub unsafe fn new_unchecked(
171        elements: ArrayRef,
172        offsets: ArrayRef,
173        sizes: ArrayRef,
174        validity: Validity,
175    ) -> Self {
176        Self {
177            dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
178            elements,
179            offsets,
180            sizes,
181            validity,
182            stats_set: Default::default(),
183        }
184    }
185
186    /// Validates the components that would be used to create a [`ListViewArray`].
187    pub(crate) fn validate(
188        elements: &dyn Array,
189        offsets: &dyn Array,
190        sizes: &dyn Array,
191        validity: &Validity,
192    ) -> VortexResult<()> {
193        // Check that offsets and sizes are integer arrays and non-nullable.
194        vortex_ensure!(
195            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
196            "offsets must be non-nullable integer array, got {}",
197            offsets.dtype()
198        );
199        vortex_ensure!(
200            sizes.dtype().is_int() && !sizes.dtype().is_nullable(),
201            "sizes must be non-nullable integer array, got {}",
202            sizes.dtype()
203        );
204
205        // Check that they have the same length.
206        vortex_ensure!(
207            offsets.len() == sizes.len(),
208            "offsets and sizes must have the same length, got {} and {}",
209            offsets.len(),
210            sizes.len()
211        );
212
213        // Check that the size type can fit within the offset type to prevent overflows.
214        let offset_ptype = offsets.dtype().as_ptype();
215        let size_ptype = sizes.dtype().as_ptype();
216        vortex_ensure!(
217            size_ptype.byte_width() <= offset_ptype.byte_width(),
218            "size type {:?} must fit within offset type {:?}",
219            size_ptype,
220            offset_ptype
221        );
222
223        // Validate the `offsets` and `sizes` arrays.
224        match_each_integer_ptype!(offset_ptype, |O| {
225            match_each_integer_ptype!(size_ptype, |S| {
226                let offsets_primitive = offsets.to_primitive();
227                let sizes_primitive = sizes.to_primitive();
228
229                let offsets_slice = offsets_primitive.as_slice::<O>();
230                let sizes_slice = sizes_primitive.as_slice::<S>();
231
232                validate_offsets_and_sizes::<O, S>(
233                    offsets_slice,
234                    sizes_slice,
235                    elements.len() as u64,
236                )?;
237            })
238        });
239
240        // If a validity array is present, it must be the same length as the ListView.
241        if let Some(validity_len) = validity.maybe_len() {
242            vortex_ensure!(
243                validity_len == offsets.len(),
244                "validity with size {validity_len} does not match array size {}",
245                offsets.len()
246            );
247        }
248
249        Ok(())
250    }
251
252    /// Returns the offset at the given index.
253    pub fn offset_at(&self, index: usize) -> usize {
254        assert!(
255            index < self.len(),
256            "Index {index} out of bounds 0..{}",
257            self.len()
258        );
259
260        // Fast path for `PrimitiveArray`.
261        self.offsets
262            .as_opt::<PrimitiveVTable>()
263            .map(|p| match_each_native_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
264            .unwrap_or_else(|| {
265                // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
266                self.offsets
267                    .scalar_at(index)
268                    .as_primitive()
269                    .as_::<usize>()
270                    .vortex_expect("offset must fit in usize")
271            })
272    }
273
274    /// Returns the size at the given index.
275    pub fn size_at(&self, index: usize) -> usize {
276        assert!(
277            index < self.len(),
278            "Index {} out of bounds 0..{}",
279            index,
280            self.len()
281        );
282
283        // Fast path for `PrimitiveArray`.
284        self.sizes
285            .as_opt::<PrimitiveVTable>()
286            .map(|p| match_each_native_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
287            .unwrap_or_else(|| {
288                // Slow path: use `scalar_at` if we can't downcast directly to `PrimitiveArray`.
289                self.sizes
290                    .scalar_at(index)
291                    .as_primitive()
292                    .as_::<usize>()
293                    .vortex_expect("size must fit in usize")
294            })
295    }
296
297    /// Returns the elements at the given index from the list array.
298    pub fn list_elements_at(&self, index: usize) -> ArrayRef {
299        let offset = self.offset_at(index);
300        let size = self.size_at(index);
301        self.elements().slice(offset..offset + size)
302    }
303
304    /// Returns the offsets array.
305    pub fn offsets(&self) -> &ArrayRef {
306        &self.offsets
307    }
308
309    /// Returns the sizes array.
310    pub fn sizes(&self) -> &ArrayRef {
311        &self.sizes
312    }
313
314    /// Returns the elements array.
315    pub fn elements(&self) -> &ArrayRef {
316        &self.elements
317    }
318}
319
320/// Helper function to validate `offsets` and `sizes` with specific types.
321fn validate_offsets_and_sizes<O, S>(
322    offsets_slice: &[O],
323    sizes_slice: &[S],
324    elements_len: u64,
325) -> VortexResult<()>
326where
327    O: NativePType + PartialOrd + Zero,
328    S: NativePType + PartialOrd + Zero,
329{
330    debug_assert_eq!(offsets_slice.len(), sizes_slice.len());
331
332    #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
333    for i in 0..offsets_slice.len() {
334        let offset = offsets_slice[i];
335        let size = sizes_slice[i];
336
337        vortex_ensure!(offset >= O::zero(), "cannot have negative offsets");
338        vortex_ensure!(size >= S::zero(), "cannot have negative size");
339
340        let offset_u64 = offset
341            .to_u64()
342            .ok_or_else(|| vortex_err!("offset[{i}] = {offset:?} cannot be converted to u64"))?;
343
344        let size_u64 = size
345            .to_u64()
346            .ok_or_else(|| vortex_err!("size[{i}] = {size:?} cannot be converted to u64"))?;
347
348        // Check for overflow when adding offset + size.
349        let end = offset_u64.checked_add(size_u64).ok_or_else(|| {
350            vortex_err!("offset[{i}] ({offset_u64}) + size[{i}] ({size_u64}) would overflow u64")
351        })?;
352
353        vortex_ensure!(
354            end <= elements_len,
355            "offset[{i}] + size[{i}] = {end} exceeds elements length {elements_len}",
356        );
357    }
358
359    Ok(())
360}
361
362/// Create a [`ListViewArray`] from a [`ListArray`](crate::arrays::ListArray) by computing `sizes`
363/// from `offsets`.
364pub fn list_view_from_list(list: ListArray) -> ListViewArray {
365    // TODO(connor)[ListView]: Create a version of `Canonical::empty` for `ListView`. It might
366    // also be worth specializing that for all canonical encodings.
367    // If the list is empty, create an empty `ListView` with the same offset dtype as the input.
368    if list.is_empty() {
369        let empty_offsets = Canonical::empty(list.offsets().dtype()).into_array();
370        let empty_sizes = Canonical::empty(list.offsets().dtype()).into_array();
371        let empty_validity = list.validity().clone();
372
373        // SAFETY: Everything is empty so all the variants are satisfied.
374        return unsafe {
375            ListViewArray::new_unchecked(
376                list.elements().clone(),
377                empty_offsets,
378                empty_sizes,
379                empty_validity,
380            )
381        };
382    }
383
384    let len = list.len();
385
386    // Get the `offsets` array directly from the `ListArray` (preserving its type).
387    let list_offsets = list.offsets().clone();
388
389    // We need to slice the `offsets` to remove the last element (`ListArray` has n+1 offsets).
390    let adjusted_offsets = list_offsets.slice(0..len);
391
392    // Create sizes array by computing differences between consecutive offsets.
393    // Use the same dtype as the offsets array to ensure compatibility.
394    let sizes = match_each_integer_ptype!(list_offsets.dtype().as_ptype(), |P| {
395        let mut sizes_builder = PrimitiveBuilder::<P>::with_capacity(Nullability::NonNullable, len);
396
397        // Create uninit range for direct memory access.
398        let mut sizes_range = sizes_builder.uninit_range(len);
399
400        // Compute sizes as the difference between consecutive offsets.
401        for i in 0..len {
402            let start = list.offset_at(i);
403            let end = list.offset_at(i + 1);
404            let size = end - start;
405
406            // Set size value directly without creating scalar.
407            sizes_range.set_value(
408                i,
409                P::try_from(size).vortex_expect("size must fit in offset type"),
410            );
411        }
412
413        // SAFETY: We have initialized all values in the range.
414        unsafe {
415            sizes_range.finish();
416        }
417
418        sizes_builder.finish_into_primitive().into_array()
419    });
420
421    // SAFETY: Since everything came from an existing valid `ListArray`, and the `sizes` were
422    // derived from valid and in-order `offsets`, we know these fields are valid.
423    unsafe {
424        ListViewArray::new_unchecked(
425            list.elements().clone(),
426            adjusted_offsets,
427            sizes,
428            list.validity().clone(),
429        )
430    }
431}