Skip to main content

vortex_array/arrays/list/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use num_traits::AsPrimitive;
7use vortex_error::VortexExpect;
8use vortex_error::VortexResult;
9use vortex_error::vortex_bail;
10use vortex_error::vortex_ensure;
11use vortex_error::vortex_panic;
12
13use crate::Array;
14use crate::ArrayRef;
15use crate::IntoArray;
16use crate::arrays::ConstantArray;
17use crate::arrays::ListVTable;
18use crate::arrays::PrimitiveVTable;
19use crate::builtins::ArrayBuiltins;
20use crate::compute::min_max;
21use crate::dtype::DType;
22use crate::dtype::NativePType;
23use crate::match_each_integer_ptype;
24use crate::match_each_native_ptype;
25use crate::scalar_fn::fns::operators::Operator;
26use crate::stats::ArrayStats;
27use crate::validity::Validity;
28
29/// A list array that stores variable-length lists of elements, similar to `Vec<Vec<T>>`.
30///
31/// This mirrors the Apache Arrow List array encoding and provides efficient storage
32/// for nested data where each row contains a list of elements of the same type.
33///
34/// ## Data Layout
35///
36/// The list array uses an offset-based encoding:
37/// - **Elements array**: A flat array containing all list elements concatenated together
38/// - **Offsets array**: Integer array where `offsets[i]` is an (inclusive) start index into
39///   the **elements** and `offsets[i+1]` is the (exclusive) stop index for the `i`th list.
40/// - **Validity**: Optional mask indicating which lists are null
41///
42/// This allows for excellent cascading compression of the elements and offsets, as similar values
43/// are clustered together and the offsets have a predictable pattern and small deltas between
44/// consecutive elements.
45///
46/// ## Offset Semantics
47///
48/// - Offsets must be non-nullable integers (i32, i64, etc.)
49/// - Offsets array has length `n+1` where `n` is the number of lists
50/// - List `i` contains elements from `elements[offsets[i]..offsets[i+1]]`
51/// - Offsets must be monotonically increasing
52///
53/// # Examples
54///
55/// ```
56/// use vortex_array::arrays::{ListArray, PrimitiveArray};
57/// use vortex_array::validity::Validity;
58/// use vortex_array::IntoArray;
59/// use vortex_buffer::buffer;
60/// use std::sync::Arc;
61///
62/// // Create a list array representing [[1, 2], [3, 4, 5], []]
63/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
64/// let offsets = buffer![0u32, 2, 5, 5].into_array(); // 3 lists
65///
66/// let list_array = ListArray::try_new(
67///     elements.into_array(),
68///     offsets.into_array(),
69///     Validity::NonNullable,
70/// ).unwrap();
71///
72/// assert_eq!(list_array.len(), 3);
73///
74/// // Access individual lists
75/// let first_list = list_array.list_elements_at(0).unwrap();
76/// assert_eq!(first_list.len(), 2); // [1, 2]
77///
78/// let third_list = list_array.list_elements_at(2).unwrap();
79/// assert!(third_list.is_empty()); // []
80/// ```
81#[derive(Clone, Debug)]
82pub struct ListArray {
83    pub(super) dtype: DType,
84    pub(super) elements: ArrayRef,
85    pub(super) offsets: ArrayRef,
86    pub(super) validity: Validity,
87    pub(super) stats_set: ArrayStats,
88}
89
90pub struct ListArrayParts {
91    pub dtype: DType,
92    pub elements: ArrayRef,
93    pub offsets: ArrayRef,
94    pub validity: Validity,
95}
96
97impl ListArray {
98    /// Creates a new [`ListArray`].
99    ///
100    /// # Panics
101    ///
102    /// Panics if the provided components do not satisfy the invariants documented
103    /// in [`ListArray::new_unchecked`].
104    pub fn new(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
105        Self::try_new(elements, offsets, validity).vortex_expect("ListArray new")
106    }
107
108    /// Constructs a new `ListArray`.
109    ///
110    /// See [`ListArray::new_unchecked`] for more information.
111    ///
112    /// # Errors
113    ///
114    /// Returns an error if the provided components do not satisfy the invariants documented in
115    /// [`ListArray::new_unchecked`].
116    pub fn try_new(
117        elements: ArrayRef,
118        offsets: ArrayRef,
119        validity: Validity,
120    ) -> VortexResult<Self> {
121        Self::validate(&elements, &offsets, &validity)?;
122
123        // SAFETY: validate ensures all invariants are met.
124        Ok(unsafe { Self::new_unchecked(elements, offsets, validity) })
125    }
126
127    /// Creates a new [`ListArray`] without validation from these components:
128    ///
129    /// * `elements` is a flat array containing all list elements concatenated.
130    /// * `offsets` is an integer array where `offsets[i]` is the start index for list `i`.
131    /// * `validity` holds the null values.
132    ///
133    /// # Safety
134    ///
135    /// The caller must ensure all of the following invariants are satisfied:
136    ///
137    /// - Offsets must be a non-nullable integer array.
138    /// - Offsets must have at least one element (even for empty lists, it should contain \[0\]).
139    /// - Offsets must be sorted (monotonically increasing).
140    /// - All offset values must be non-negative.
141    /// - The maximum offset must not exceed `elements.len()`.
142    /// - If validity is an array, its length must equal `offsets.len() - 1`.
143    pub unsafe fn new_unchecked(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
144        #[cfg(debug_assertions)]
145        Self::validate(&elements, &offsets, &validity)
146            .vortex_expect("[Debug Assertion]: Invalid `ListViewArray` parameters");
147
148        Self {
149            dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
150            elements,
151            offsets,
152            validity,
153            stats_set: Default::default(),
154        }
155    }
156
157    /// Validates the components that would be used to create a [`ListArray`].
158    ///
159    /// This function checks all the invariants required by [`ListArray::new_unchecked`].
160    pub fn validate(
161        elements: &dyn Array,
162        offsets: &dyn Array,
163        validity: &Validity,
164    ) -> VortexResult<()> {
165        // Offsets must have at least one element
166        vortex_ensure!(
167            !offsets.is_empty(),
168            InvalidArgument: "Offsets must have at least one element, [0] for an empty list"
169        );
170
171        // Offsets must be of integer type, and cannot go lower than 0.
172        vortex_ensure!(
173            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
174            InvalidArgument: "offsets have invalid type {}",
175            offsets.dtype()
176        );
177
178        // We can safely unwrap the DType as primitive now
179        let offsets_ptype = offsets.dtype().as_ptype();
180
181        // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed)
182        if let Some(is_sorted) = offsets.statistics().compute_is_sorted() {
183            vortex_ensure!(is_sorted, InvalidArgument: "offsets must be sorted");
184        } else {
185            vortex_bail!(InvalidArgument: "offsets must report is_sorted statistic");
186        }
187
188        // Validate that offsets min is non-negative, and max does not exceed the length of
189        // the elements array.
190        if let Some(min_max) = min_max(offsets)? {
191            match_each_integer_ptype!(offsets_ptype, |P| {
192                #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
193                {
194                    let max = min_max
195                        .max
196                        .as_primitive()
197                        .as_::<P>()
198                        .vortex_expect("offsets type must fit offsets values");
199                    let min = min_max
200                        .min
201                        .as_primitive()
202                        .as_::<P>()
203                        .vortex_expect("offsets type must fit offsets values");
204
205                    vortex_ensure!(
206                        min >= 0,
207                        InvalidArgument: "offsets minimum {min} outside valid range [0, {max}]"
208                    );
209
210                    vortex_ensure!(
211                        max <= P::try_from(elements.len()).unwrap_or_else(|_| vortex_panic!(
212                            "Offsets type {} must be able to fit elements length {}",
213                            <P as NativePType>::PTYPE,
214                            elements.len()
215                        )),
216                        InvalidArgument: "Max offset {max} is beyond the length of the elements array {}",
217                        elements.len()
218                    );
219                }
220            })
221        } else {
222            // TODO(aduffy): fallback to slower validation pathway?
223            vortex_bail!(
224                InvalidArgument: "offsets array with encoding {} must support min_max compute function",
225                offsets.encoding_id()
226            );
227        };
228
229        // If a validity array is present, it must be the same length as the ListArray
230        if let Some(validity_len) = validity.maybe_len() {
231            vortex_ensure!(
232                validity_len == offsets.len() - 1,
233                InvalidArgument: "validity with size {validity_len} does not match array size {}",
234                offsets.len() - 1
235            );
236        }
237
238        Ok(())
239    }
240
241    /// Splits an array into its parts
242    pub fn into_parts(self) -> ListArrayParts {
243        ListArrayParts {
244            dtype: self.dtype,
245            elements: self.elements,
246            offsets: self.offsets,
247            validity: self.validity,
248        }
249    }
250
251    /// Returns the offset at the given index from the list array.
252    ///
253    /// Returns an error if the index is out of bounds or scalar_at fails.
254    pub fn offset_at(&self, index: usize) -> VortexResult<usize> {
255        vortex_ensure!(
256            index <= self.len(),
257            "Index {index} out of bounds 0..={}",
258            self.len()
259        );
260
261        if let Some(p) = self.offsets().as_opt::<PrimitiveVTable>() {
262            Ok(match_each_native_ptype!(p.ptype(), |P| {
263                p.as_slice::<P>()[index].as_()
264            }))
265        } else {
266            self.offsets()
267                .scalar_at(index)?
268                .as_primitive()
269                .as_::<usize>()
270                .ok_or_else(|| vortex_error::vortex_err!("offset value does not fit in usize"))
271        }
272    }
273
274    /// Returns the elements of the list scalar at the given index of the list array.
275    pub fn list_elements_at(&self, index: usize) -> VortexResult<ArrayRef> {
276        let start = self.offset_at(index)?;
277        let end = self.offset_at(index + 1)?;
278        self.elements().slice(start..end)
279    }
280
281    /// Returns elements of the list array referenced by the offsets array.
282    ///
283    /// This is useful for discarding any potentially unused parts of the underlying `elements`
284    /// child array.
285    pub fn sliced_elements(&self) -> VortexResult<ArrayRef> {
286        let start = self.offset_at(0)?;
287        let end = self.offset_at(self.len())?;
288        self.elements().slice(start..end)
289    }
290
291    /// Returns the offsets array.
292    pub fn offsets(&self) -> &ArrayRef {
293        &self.offsets
294    }
295
296    /// Returns the element dtype of the list array.
297    pub fn element_dtype(&self) -> &Arc<DType> {
298        match &self.dtype {
299            DType::List(element_dtype, _) => element_dtype,
300            _ => vortex_panic!("ListArray has invalid dtype {}", self.dtype),
301        }
302    }
303
304    /// Returns the elements array.
305    pub fn elements(&self) -> &ArrayRef {
306        &self.elements
307    }
308
309    // TODO(connor)[ListView]: Create 2 functions `reset_offsets` and `recursive_reset_offsets`,
310    // where `reset_offsets` is infallible.
311    // Also, `reset_offsets` can be made more efficient by replacing `sub_scalar` with a match on
312    // the offset type and manual subtraction and fast path where `offsets[0] == 0`.
313
314    /// Create a copy of this array by adjusting `offsets` to start at `0` and removing elements not
315    /// referenced by the `offsets`.
316    pub fn reset_offsets(&self, recurse: bool) -> VortexResult<Self> {
317        let mut elements = self.sliced_elements()?;
318        if recurse && elements.is_canonical() {
319            elements = elements.to_canonical()?.compact()?.into_array();
320        } else if recurse && let Some(child_list_array) = elements.as_opt::<ListVTable>() {
321            elements = child_list_array.reset_offsets(recurse)?.into_array();
322        }
323
324        let offsets = self.offsets();
325        let first_offset = offsets.scalar_at(0)?;
326        let adjusted_offsets = offsets.to_array().binary(
327            ConstantArray::new(first_offset, offsets.len()).into_array(),
328            Operator::Sub,
329        )?;
330
331        Self::try_new(elements, adjusted_offsets, self.validity.clone())
332    }
333}