Skip to main content

vortex_array/arrays/list/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use num_traits::AsPrimitive;
7use vortex_dtype::DType;
8use vortex_dtype::NativePType;
9use vortex_dtype::match_each_integer_ptype;
10use vortex_dtype::match_each_native_ptype;
11use vortex_error::VortexExpect;
12use vortex_error::VortexResult;
13use vortex_error::vortex_bail;
14use vortex_error::vortex_ensure;
15use vortex_error::vortex_panic;
16
17use crate::Array;
18use crate::ArrayRef;
19use crate::IntoArray;
20use crate::arrays::ListVTable;
21use crate::arrays::PrimitiveVTable;
22use crate::compute::min_max;
23use crate::compute::sub_scalar;
24use crate::stats::ArrayStats;
25use crate::validity::Validity;
26
27/// A list array that stores variable-length lists of elements, similar to `Vec<Vec<T>>`.
28///
29/// This mirrors the Apache Arrow List array encoding and provides efficient storage
30/// for nested data where each row contains a list of elements of the same type.
31///
32/// ## Data Layout
33///
34/// The list array uses an offset-based encoding:
35/// - **Elements array**: A flat array containing all list elements concatenated together
36/// - **Offsets array**: Integer array where `offsets[i]` is an (inclusive) start index into
37///   the **elements** and `offsets[i+1]` is the (exclusive) stop index for the `i`th list.
38/// - **Validity**: Optional mask indicating which lists are null
39///
40/// This allows for excellent cascading compression of the elements and offsets, as similar values
41/// are clustered together and the offsets have a predictable pattern and small deltas between
42/// consecutive elements.
43///
44/// ## Offset Semantics
45///
46/// - Offsets must be non-nullable integers (i32, i64, etc.)
47/// - Offsets array has length `n+1` where `n` is the number of lists
48/// - List `i` contains elements from `elements[offsets[i]..offsets[i+1]]`
49/// - Offsets must be monotonically increasing
50///
51/// # Examples
52///
53/// ```
54/// use vortex_array::arrays::{ListArray, PrimitiveArray};
55/// use vortex_array::validity::Validity;
56/// use vortex_array::IntoArray;
57/// use vortex_buffer::buffer;
58/// use std::sync::Arc;
59///
60/// // Create a list array representing [[1, 2], [3, 4, 5], []]
61/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
62/// let offsets = buffer![0u32, 2, 5, 5].into_array(); // 3 lists
63///
64/// let list_array = ListArray::try_new(
65///     elements.into_array(),
66///     offsets.into_array(),
67///     Validity::NonNullable,
68/// ).unwrap();
69///
70/// assert_eq!(list_array.len(), 3);
71///
72/// // Access individual lists
73/// let first_list = list_array.list_elements_at(0).unwrap();
74/// assert_eq!(first_list.len(), 2); // [1, 2]
75///
76/// let third_list = list_array.list_elements_at(2).unwrap();
77/// assert!(third_list.is_empty()); // []
78/// ```
79#[derive(Clone, Debug)]
80pub struct ListArray {
81    pub(super) dtype: DType,
82    pub(super) elements: ArrayRef,
83    pub(super) offsets: ArrayRef,
84    pub(super) validity: Validity,
85    pub(super) stats_set: ArrayStats,
86}
87
88pub struct ListArrayParts {
89    pub dtype: DType,
90    pub elements: ArrayRef,
91    pub offsets: ArrayRef,
92    pub validity: Validity,
93}
94
95impl ListArray {
96    /// Creates a new [`ListArray`].
97    ///
98    /// # Panics
99    ///
100    /// Panics if the provided components do not satisfy the invariants documented
101    /// in [`ListArray::new_unchecked`].
102    pub fn new(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
103        Self::try_new(elements, offsets, validity).vortex_expect("ListArray new")
104    }
105
106    /// Constructs a new `ListArray`.
107    ///
108    /// See [`ListArray::new_unchecked`] for more information.
109    ///
110    /// # Errors
111    ///
112    /// Returns an error if the provided components do not satisfy the invariants documented in
113    /// [`ListArray::new_unchecked`].
114    pub fn try_new(
115        elements: ArrayRef,
116        offsets: ArrayRef,
117        validity: Validity,
118    ) -> VortexResult<Self> {
119        Self::validate(&elements, &offsets, &validity)?;
120
121        // SAFETY: validate ensures all invariants are met.
122        Ok(unsafe { Self::new_unchecked(elements, offsets, validity) })
123    }
124
125    /// Creates a new [`ListArray`] without validation from these components:
126    ///
127    /// * `elements` is a flat array containing all list elements concatenated.
128    /// * `offsets` is an integer array where `offsets[i]` is the start index for list `i`.
129    /// * `validity` holds the null values.
130    ///
131    /// # Safety
132    ///
133    /// The caller must ensure all of the following invariants are satisfied:
134    ///
135    /// - Offsets must be a non-nullable integer array.
136    /// - Offsets must have at least one element (even for empty lists, it should contain \[0\]).
137    /// - Offsets must be sorted (monotonically increasing).
138    /// - All offset values must be non-negative.
139    /// - The maximum offset must not exceed `elements.len()`.
140    /// - If validity is an array, its length must equal `offsets.len() - 1`.
141    pub unsafe fn new_unchecked(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
142        #[cfg(debug_assertions)]
143        Self::validate(&elements, &offsets, &validity)
144            .vortex_expect("[Debug Assertion]: Invalid `ListViewArray` parameters");
145
146        Self {
147            dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
148            elements,
149            offsets,
150            validity,
151            stats_set: Default::default(),
152        }
153    }
154
155    /// Validates the components that would be used to create a [`ListArray`].
156    ///
157    /// This function checks all the invariants required by [`ListArray::new_unchecked`].
158    pub fn validate(
159        elements: &dyn Array,
160        offsets: &dyn Array,
161        validity: &Validity,
162    ) -> VortexResult<()> {
163        // Offsets must have at least one element
164        vortex_ensure!(
165            !offsets.is_empty(),
166            InvalidArgument: "Offsets must have at least one element, [0] for an empty list"
167        );
168
169        // Offsets must be of integer type, and cannot go lower than 0.
170        vortex_ensure!(
171            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
172            InvalidArgument: "offsets have invalid type {}",
173            offsets.dtype()
174        );
175
176        // We can safely unwrap the DType as primitive now
177        let offsets_ptype = offsets.dtype().as_ptype();
178
179        // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed)
180        if let Some(is_sorted) = offsets.statistics().compute_is_sorted() {
181            vortex_ensure!(is_sorted, InvalidArgument: "offsets must be sorted");
182        } else {
183            vortex_bail!(InvalidArgument: "offsets must report is_sorted statistic");
184        }
185
186        // Validate that offsets min is non-negative, and max does not exceed the length of
187        // the elements array.
188        if let Some(min_max) = min_max(offsets)? {
189            match_each_integer_ptype!(offsets_ptype, |P| {
190                #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
191                {
192                    let max = min_max
193                        .max
194                        .as_primitive()
195                        .as_::<P>()
196                        .vortex_expect("offsets type must fit offsets values");
197                    let min = min_max
198                        .min
199                        .as_primitive()
200                        .as_::<P>()
201                        .vortex_expect("offsets type must fit offsets values");
202
203                    vortex_ensure!(
204                        min >= 0,
205                        InvalidArgument: "offsets minimum {min} outside valid range [0, {max}]"
206                    );
207
208                    vortex_ensure!(
209                        max <= P::try_from(elements.len()).unwrap_or_else(|_| vortex_panic!(
210                            "Offsets type {} must be able to fit elements length {}",
211                            <P as NativePType>::PTYPE,
212                            elements.len()
213                        )),
214                        InvalidArgument: "Max offset {max} is beyond the length of the elements array {}",
215                        elements.len()
216                    );
217                }
218            })
219        } else {
220            // TODO(aduffy): fallback to slower validation pathway?
221            vortex_bail!(
222                InvalidArgument: "offsets array with encoding {} must support min_max compute function",
223                offsets.encoding_id()
224            );
225        };
226
227        // If a validity array is present, it must be the same length as the ListArray
228        if let Some(validity_len) = validity.maybe_len() {
229            vortex_ensure!(
230                validity_len == offsets.len() - 1,
231                InvalidArgument: "validity with size {validity_len} does not match array size {}",
232                offsets.len() - 1
233            );
234        }
235
236        Ok(())
237    }
238
239    /// Splits an array into its parts
240    pub fn into_parts(self) -> ListArrayParts {
241        ListArrayParts {
242            dtype: self.dtype,
243            elements: self.elements,
244            offsets: self.offsets,
245            validity: self.validity,
246        }
247    }
248
249    /// Returns the offset at the given index from the list array.
250    ///
251    /// Returns an error if the index is out of bounds or scalar_at fails.
252    pub fn offset_at(&self, index: usize) -> VortexResult<usize> {
253        vortex_ensure!(
254            index <= self.len(),
255            "Index {index} out of bounds 0..={}",
256            self.len()
257        );
258
259        if let Some(p) = self.offsets().as_opt::<PrimitiveVTable>() {
260            Ok(match_each_native_ptype!(p.ptype(), |P| {
261                p.as_slice::<P>()[index].as_()
262            }))
263        } else {
264            self.offsets()
265                .scalar_at(index)?
266                .as_primitive()
267                .as_::<usize>()
268                .ok_or_else(|| vortex_error::vortex_err!("offset value does not fit in usize"))
269        }
270    }
271
272    /// Returns the elements of the list scalar at the given index of the list array.
273    pub fn list_elements_at(&self, index: usize) -> VortexResult<ArrayRef> {
274        let start = self.offset_at(index)?;
275        let end = self.offset_at(index + 1)?;
276        self.elements().slice(start..end)
277    }
278
279    /// Returns elements of the list array referenced by the offsets array.
280    ///
281    /// This is useful for discarding any potentially unused parts of the underlying `elements`
282    /// child array.
283    pub fn sliced_elements(&self) -> VortexResult<ArrayRef> {
284        let start = self.offset_at(0)?;
285        let end = self.offset_at(self.len())?;
286        self.elements().slice(start..end)
287    }
288
289    /// Returns the offsets array.
290    pub fn offsets(&self) -> &ArrayRef {
291        &self.offsets
292    }
293
294    /// Returns the element dtype of the list array.
295    pub fn element_dtype(&self) -> &Arc<DType> {
296        match &self.dtype {
297            DType::List(element_dtype, _) => element_dtype,
298            _ => vortex_panic!("ListArray has invalid dtype {}", self.dtype),
299        }
300    }
301
302    /// Returns the elements array.
303    pub fn elements(&self) -> &ArrayRef {
304        &self.elements
305    }
306
307    // TODO(connor)[ListView]: Create 2 functions `reset_offsets` and `recursive_reset_offsets`,
308    // where `reset_offsets` is infallible.
309    // Also, `reset_offsets` can be made more efficient by replacing `sub_scalar` with a match on
310    // the offset type and manual subtraction and fast path where `offsets[0] == 0`.
311
312    /// Create a copy of this array by adjusting `offsets` to start at `0` and removing elements not
313    /// referenced by the `offsets`.
314    pub fn reset_offsets(&self, recurse: bool) -> VortexResult<Self> {
315        let mut elements = self.sliced_elements()?;
316        if recurse && elements.is_canonical() {
317            elements = elements.to_canonical()?.compact()?.into_array();
318        } else if recurse && let Some(child_list_array) = elements.as_opt::<ListVTable>() {
319            elements = child_list_array.reset_offsets(recurse)?.into_array();
320        }
321
322        let offsets = self.offsets();
323        let first_offset = offsets.scalar_at(0)?;
324        let adjusted_offsets = sub_scalar(offsets, first_offset)?;
325
326        Self::try_new(elements, adjusted_offsets, self.validity.clone())
327    }
328}