vortex_array/arrays/list/array.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use num_traits::AsPrimitive;
7use vortex_dtype::DType;
8use vortex_dtype::NativePType;
9use vortex_dtype::match_each_integer_ptype;
10use vortex_dtype::match_each_native_ptype;
11use vortex_error::VortexExpect;
12use vortex_error::VortexResult;
13use vortex_error::vortex_bail;
14use vortex_error::vortex_ensure;
15use vortex_error::vortex_panic;
16
17use crate::Array;
18use crate::ArrayRef;
19use crate::IntoArray;
20use crate::arrays::ListVTable;
21use crate::arrays::PrimitiveVTable;
22use crate::compute::min_max;
23use crate::compute::sub_scalar;
24use crate::stats::ArrayStats;
25use crate::validity::Validity;
26
27/// A list array that stores variable-length lists of elements, similar to `Vec<Vec<T>>`.
28///
29/// This mirrors the Apache Arrow List array encoding and provides efficient storage
30/// for nested data where each row contains a list of elements of the same type.
31///
32/// ## Data Layout
33///
34/// The list array uses an offset-based encoding:
35/// - **Elements array**: A flat array containing all list elements concatenated together
36/// - **Offsets array**: Integer array where `offsets[i]` is an (inclusive) start index into
37/// the **elements** and `offsets[i+1]` is the (exclusive) stop index for the `i`th list.
38/// - **Validity**: Optional mask indicating which lists are null
39///
40/// This allows for excellent cascading compression of the elements and offsets, as similar values
41/// are clustered together and the offsets have a predictable pattern and small deltas between
42/// consecutive elements.
43///
44/// ## Offset Semantics
45///
46/// - Offsets must be non-nullable integers (i32, i64, etc.)
47/// - Offsets array has length `n+1` where `n` is the number of lists
48/// - List `i` contains elements from `elements[offsets[i]..offsets[i+1]]`
49/// - Offsets must be monotonically increasing
50///
51/// # Examples
52///
53/// ```
54/// use vortex_array::arrays::{ListArray, PrimitiveArray};
55/// use vortex_array::validity::Validity;
56/// use vortex_array::IntoArray;
57/// use vortex_buffer::buffer;
58/// use std::sync::Arc;
59///
60/// // Create a list array representing [[1, 2], [3, 4, 5], []]
61/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
62/// let offsets = buffer![0u32, 2, 5, 5].into_array(); // 3 lists
63///
64/// let list_array = ListArray::try_new(
65/// elements.into_array(),
66/// offsets.into_array(),
67/// Validity::NonNullable,
68/// ).unwrap();
69///
70/// assert_eq!(list_array.len(), 3);
71///
72/// // Access individual lists
73/// let first_list = list_array.list_elements_at(0);
74/// assert_eq!(first_list.len(), 2); // [1, 2]
75///
76/// let third_list = list_array.list_elements_at(2);
77/// assert!(third_list.is_empty()); // []
78/// ```
79#[derive(Clone, Debug)]
80pub struct ListArray {
81 pub(super) dtype: DType,
82 pub(super) elements: ArrayRef,
83 pub(super) offsets: ArrayRef,
84 pub(super) validity: Validity,
85 pub(super) stats_set: ArrayStats,
86}
87
88impl ListArray {
89 /// Creates a new [`ListArray`].
90 ///
91 /// # Panics
92 ///
93 /// Panics if the provided components do not satisfy the invariants documented
94 /// in [`ListArray::new_unchecked`].
95 pub fn new(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
96 Self::try_new(elements, offsets, validity).vortex_expect("ListArray new")
97 }
98
99 /// Constructs a new `ListArray`.
100 ///
101 /// See [`ListArray::new_unchecked`] for more information.
102 ///
103 /// # Errors
104 ///
105 /// Returns an error if the provided components do not satisfy the invariants documented in
106 /// [`ListArray::new_unchecked`].
107 pub fn try_new(
108 elements: ArrayRef,
109 offsets: ArrayRef,
110 validity: Validity,
111 ) -> VortexResult<Self> {
112 Self::validate(&elements, &offsets, &validity)?;
113
114 // SAFETY: validate ensures all invariants are met.
115 Ok(unsafe { Self::new_unchecked(elements, offsets, validity) })
116 }
117
118 /// Creates a new [`ListArray`] without validation from these components:
119 ///
120 /// * `elements` is a flat array containing all list elements concatenated.
121 /// * `offsets` is an integer array where `offsets[i]` is the start index for list `i`.
122 /// * `validity` holds the null values.
123 ///
124 /// # Safety
125 ///
126 /// The caller must ensure all of the following invariants are satisfied:
127 ///
128 /// - Offsets must be a non-nullable integer array.
129 /// - Offsets must have at least one element (even for empty lists, it should contain \[0\]).
130 /// - Offsets must be sorted (monotonically increasing).
131 /// - All offset values must be non-negative.
132 /// - The maximum offset must not exceed `elements.len()`.
133 /// - If validity is an array, its length must equal `offsets.len() - 1`.
134 pub unsafe fn new_unchecked(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
135 #[cfg(debug_assertions)]
136 Self::validate(&elements, &offsets, &validity)
137 .vortex_expect("[Debug Assertion]: Invalid `ListViewArray` parameters");
138
139 Self {
140 dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
141 elements,
142 offsets,
143 validity,
144 stats_set: Default::default(),
145 }
146 }
147
148 /// Validates the components that would be used to create a [`ListArray`].
149 ///
150 /// This function checks all the invariants required by [`ListArray::new_unchecked`].
151 pub fn validate(
152 elements: &dyn Array,
153 offsets: &dyn Array,
154 validity: &Validity,
155 ) -> VortexResult<()> {
156 // Offsets must have at least one element
157 vortex_ensure!(
158 !offsets.is_empty(),
159 "Offsets must have at least one element, [0] for an empty list"
160 );
161
162 // Offsets must be of integer type, and cannot go lower than 0.
163 vortex_ensure!(
164 offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
165 "offsets have invalid type {}",
166 offsets.dtype()
167 );
168
169 // We can safely unwrap the DType as primitive now
170 let offsets_ptype = offsets.dtype().as_ptype();
171
172 // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed)
173 if let Some(is_sorted) = offsets.statistics().compute_is_sorted() {
174 vortex_ensure!(is_sorted, "offsets must be sorted");
175 } else {
176 vortex_bail!("offsets must report is_sorted statistic");
177 }
178
179 // Validate that offsets min is non-negative, and max does not exceed the length of
180 // the elements array.
181 if let Some(min_max) = min_max(offsets)? {
182 match_each_integer_ptype!(offsets_ptype, |P| {
183 let max_offset = P::try_from(offsets.scalar_at(offsets.len() - 1))
184 .vortex_expect("Offsets type must fit offsets values");
185
186 #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
187 {
188 if let Some(min) = min_max.min.as_primitive().as_::<P>() {
189 vortex_ensure!(
190 min >= 0 && min <= max_offset,
191 "offsets minimum {min} outside valid range [0, {max_offset}]"
192 );
193 }
194
195 if let Some(max) = min_max.max.as_primitive().as_::<P>() {
196 vortex_ensure!(
197 max >= 0 && max <= max_offset,
198 "offsets maximum {max} outside valid range [0, {max_offset}]"
199 )
200 }
201 }
202
203 vortex_ensure!(
204 max_offset
205 <= P::try_from(elements.len()).unwrap_or_else(|_| vortex_panic!(
206 "Offsets type {} must be able to fit elements length {}",
207 <P as NativePType>::PTYPE,
208 elements.len()
209 )),
210 "Max offset {max_offset} is beyond the length of the elements array {}",
211 elements.len()
212 );
213 })
214 } else {
215 // TODO(aduffy): fallback to slower validation pathway?
216 vortex_bail!(
217 "offsets array with encoding {} must support min_max compute function",
218 offsets.encoding_id()
219 );
220 };
221
222 // If a validity array is present, it must be the same length as the ListArray
223 if let Some(validity_len) = validity.maybe_len() {
224 vortex_ensure!(
225 validity_len == offsets.len() - 1,
226 "validity with size {validity_len} does not match array size {}",
227 offsets.len() - 1
228 );
229 }
230
231 Ok(())
232 }
233
234 /// Returns the offset at the given index from the list array.
235 ///
236 /// Panics if the index is out of bounds.
237 pub fn offset_at(&self, index: usize) -> usize {
238 assert!(
239 index <= self.len(),
240 "Index {index} out of bounds 0..={}",
241 self.len()
242 );
243
244 self.offsets()
245 .as_opt::<PrimitiveVTable>()
246 .map(|p| match_each_native_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
247 .unwrap_or_else(|| {
248 self.offsets()
249 .scalar_at(index)
250 .as_primitive()
251 .as_::<usize>()
252 .vortex_expect("index must fit in usize")
253 })
254 }
255
256 /// Returns the elements of the list scalar at the given index of the list array.
257 pub fn list_elements_at(&self, index: usize) -> ArrayRef {
258 let start = self.offset_at(index);
259 let end = self.offset_at(index + 1);
260 self.elements().slice(start..end)
261 }
262
263 /// Returns elements of the list array referenced by the offsets array.
264 ///
265 /// This is useful for discarding any potentially unused parts of the underlying `elements`
266 /// child array.
267 pub fn sliced_elements(&self) -> ArrayRef {
268 let start = self.offset_at(0);
269 let end = self.offset_at(self.len());
270 self.elements().slice(start..end)
271 }
272
273 /// Returns the offsets array.
274 pub fn offsets(&self) -> &ArrayRef {
275 &self.offsets
276 }
277
278 /// Returns the elements array.
279 pub fn elements(&self) -> &ArrayRef {
280 &self.elements
281 }
282
283 // TODO(connor)[ListView]: Create 2 functions `reset_offsets` and `recursive_reset_offsets`,
284 // where `reset_offsets` is infallible.
285 // Also, `reset_offsets` can be made more efficient by replacing `sub_scalar` with a match on
286 // the offset type and manual subtraction and fast path where `offsets[0] == 0`.
287
288 /// Create a copy of this array by adjusting `offsets` to start at `0` and removing elements not
289 /// referenced by the `offsets`.
290 pub fn reset_offsets(&self, recurse: bool) -> VortexResult<Self> {
291 let mut elements = self.sliced_elements();
292 if recurse && elements.is_canonical() {
293 elements = elements.to_canonical().compact()?.into_array();
294 } else if recurse && let Some(child_list_array) = elements.as_opt::<ListVTable>() {
295 elements = child_list_array.reset_offsets(recurse)?.into_array();
296 }
297
298 let offsets = self.offsets();
299 let first_offset = offsets.scalar_at(0);
300 let adjusted_offsets = sub_scalar(offsets, first_offset)?;
301
302 Self::try_new(elements, adjusted_offsets, self.validity.clone())
303 }
304}