vortex_array/arrays/list/array.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use num_traits::AsPrimitive;
7use vortex_dtype::{DType, match_each_integer_ptype, match_each_native_ptype};
8use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_ensure};
9
10use crate::arrays::{ListVTable, PrimitiveVTable};
11use crate::compute::{min_max, sub_scalar};
12use crate::stats::ArrayStats;
13use crate::validity::Validity;
14use crate::{Array, ArrayRef, IntoArray};
15
16/// A list array that stores variable-length lists of elements, similar to `Vec<Vec<T>>`.
17///
18/// This mirrors the Apache Arrow List array encoding and provides efficient storage
19/// for nested data where each row contains a list of elements of the same type.
20///
21/// ## Data Layout
22///
23/// The list array uses an offset-based encoding:
24/// - **Elements array**: A flat array containing all list elements concatenated together
25/// - **Offsets array**: Integer array where `offsets[i]` is an (inclusive) start index into
26/// the **elements** and `offsets[i+1]` is the (exclusive) stop index for the `i`th list.
27/// - **Validity**: Optional mask indicating which lists are null
28///
29/// This allows for excellent cascading compression of the elements and offsets, as similar values
30/// are clustered together and the offsets have a predictable pattern and small deltas between
31/// consecutive elements.
32///
33/// ## Offset Semantics
34///
35/// - Offsets must be non-nullable integers (i32, i64, etc.)
36/// - Offsets array has length `n+1` where `n` is the number of lists
37/// - List `i` contains elements from `elements[offsets[i]..offsets[i+1]]`
38/// - Offsets must be monotonically increasing
39///
40/// # Examples
41///
42/// ```
43/// use vortex_array::arrays::{ListArray, PrimitiveArray};
44/// use vortex_array::validity::Validity;
45/// use vortex_array::IntoArray;
46/// use vortex_buffer::buffer;
47/// use std::sync::Arc;
48///
49/// // Create a list array representing [[1, 2], [3, 4, 5], []]
50/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
51/// let offsets = buffer![0u32, 2, 5, 5].into_array(); // 3 lists
52///
53/// let list_array = ListArray::try_new(
54/// elements.into_array(),
55/// offsets.into_array(),
56/// Validity::NonNullable,
57/// ).unwrap();
58///
59/// assert_eq!(list_array.len(), 3);
60///
61/// // Access individual lists
62/// let first_list = list_array.list_elements_at(0);
63/// assert_eq!(first_list.len(), 2); // [1, 2]
64///
65/// let third_list = list_array.list_elements_at(2);
66/// assert!(third_list.is_empty()); // []
67/// ```
68#[derive(Clone, Debug)]
69pub struct ListArray {
70 pub(super) dtype: DType,
71 pub(super) elements: ArrayRef,
72 pub(super) offsets: ArrayRef,
73 pub(super) validity: Validity,
74 pub(super) stats_set: ArrayStats,
75}
76
77impl ListArray {
78 /// Creates a new [`ListArray`].
79 ///
80 /// # Panics
81 ///
82 /// Panics if the provided components do not satisfy the invariants documented
83 /// in [`ListArray::new_unchecked`].
84 pub fn new(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
85 Self::try_new(elements, offsets, validity).vortex_expect("ListArray new")
86 }
87
88 /// Constructs a new `ListArray`.
89 ///
90 /// See [`ListArray::new_unchecked`] for more information.
91 ///
92 /// # Errors
93 ///
94 /// Returns an error if the provided components do not satisfy the invariants documented in
95 /// [`ListArray::new_unchecked`].
96 pub fn try_new(
97 elements: ArrayRef,
98 offsets: ArrayRef,
99 validity: Validity,
100 ) -> VortexResult<Self> {
101 Self::validate(&elements, &offsets, &validity)?;
102
103 // SAFETY: validate ensures all invariants are met.
104 Ok(unsafe { Self::new_unchecked(elements, offsets, validity) })
105 }
106
107 /// Creates a new [`ListArray`] without validation from these components:
108 ///
109 /// * `elements` is a flat array containing all list elements concatenated.
110 /// * `offsets` is an integer array where `offsets[i]` is the start index for list `i`.
111 /// * `validity` holds the null values.
112 ///
113 /// # Safety
114 ///
115 /// The caller must ensure all of the following invariants are satisfied:
116 ///
117 /// - Offsets must be a non-nullable integer array.
118 /// - Offsets must have at least one element (even for empty lists, it should contain \[0\]).
119 /// - Offsets must be sorted (monotonically increasing).
120 /// - All offset values must be non-negative.
121 /// - The maximum offset must not exceed `elements.len()`.
122 /// - If validity is an array, its length must equal `offsets.len() - 1`.
123 pub unsafe fn new_unchecked(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
124 #[cfg(debug_assertions)]
125 Self::validate(&elements, &offsets, &validity)
126 .vortex_expect("[Debug Assertion]: Invalid `ListViewArray` parameters");
127
128 Self {
129 dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
130 elements,
131 offsets,
132 validity,
133 stats_set: Default::default(),
134 }
135 }
136
137 /// Validates the components that would be used to create a [`ListArray`].
138 ///
139 /// This function checks all the invariants required by [`ListArray::new_unchecked`].
140 pub fn validate(
141 elements: &dyn Array,
142 offsets: &dyn Array,
143 validity: &Validity,
144 ) -> VortexResult<()> {
145 // Offsets must have at least one element
146 vortex_ensure!(
147 !offsets.is_empty(),
148 "Offsets must have at least one element, [0] for an empty list"
149 );
150
151 // Offsets must be of integer type, and cannot go lower than 0.
152 vortex_ensure!(
153 offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
154 "offsets have invalid type {}",
155 offsets.dtype()
156 );
157
158 // We can safely unwrap the DType as primitive now
159 let offsets_ptype = offsets.dtype().as_ptype();
160
161 // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed)
162 if let Some(is_sorted) = offsets.statistics().compute_is_sorted() {
163 vortex_ensure!(is_sorted, "offsets must be sorted");
164 } else {
165 vortex_bail!("offsets must report is_sorted statistic");
166 }
167
168 // Validate that offsets min is non-negative, and max does not exceed the length of
169 // the elements array.
170 if let Some(min_max) = min_max(offsets)? {
171 match_each_integer_ptype!(offsets_ptype, |P| {
172 let max_offset = P::try_from(offsets.scalar_at(offsets.len() - 1))
173 .vortex_expect("Offsets type must fit offsets values");
174
175 #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
176 {
177 if let Some(min) = min_max.min.as_primitive().as_::<P>() {
178 vortex_ensure!(
179 min >= 0 && min <= max_offset,
180 "offsets minimum {min} outside valid range [0, {max_offset}]"
181 );
182 }
183
184 if let Some(max) = min_max.max.as_primitive().as_::<P>() {
185 vortex_ensure!(
186 max >= 0 && max <= max_offset,
187 "offsets maximum {max} outside valid range [0, {max_offset}]"
188 )
189 }
190 }
191
192 vortex_ensure!(
193 max_offset
194 <= P::try_from(elements.len())
195 .vortex_expect("Offsets type must be able to fit elements length"),
196 "Max offset {max_offset} is beyond the length of the elements array {}",
197 elements.len()
198 );
199 })
200 } else {
201 // TODO(aduffy): fallback to slower validation pathway?
202 vortex_bail!(
203 "offsets array with encoding {} must support min_max compute function",
204 offsets.encoding_id()
205 );
206 };
207
208 // If a validity array is present, it must be the same length as the ListArray
209 if let Some(validity_len) = validity.maybe_len() {
210 vortex_ensure!(
211 validity_len == offsets.len() - 1,
212 "validity with size {validity_len} does not match array size {}",
213 offsets.len() - 1
214 );
215 }
216
217 Ok(())
218 }
219
220 /// Returns the offset at the given index from the list array.
221 ///
222 /// Panics if the index is out of bounds.
223 pub fn offset_at(&self, index: usize) -> usize {
224 assert!(
225 index <= self.len(),
226 "Index {index} out of bounds 0..={}",
227 self.len()
228 );
229
230 self.offsets()
231 .as_opt::<PrimitiveVTable>()
232 .map(|p| match_each_native_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
233 .unwrap_or_else(|| {
234 self.offsets()
235 .scalar_at(index)
236 .as_primitive()
237 .as_::<usize>()
238 .vortex_expect("index must fit in usize")
239 })
240 }
241
242 /// Returns the elements of the list scalar at the given index of the list array.
243 pub fn list_elements_at(&self, index: usize) -> ArrayRef {
244 let start = self.offset_at(index);
245 let end = self.offset_at(index + 1);
246 self.elements().slice(start..end)
247 }
248
249 /// Returns elements of the list array referenced by the offsets array.
250 ///
251 /// This is useful for discarding any potentially unused parts of the underlying `elements`
252 /// child array.
253 pub fn sliced_elements(&self) -> ArrayRef {
254 let start = self.offset_at(0);
255 let end = self.offset_at(self.len());
256 self.elements().slice(start..end)
257 }
258
259 /// Returns the offsets array.
260 pub fn offsets(&self) -> &ArrayRef {
261 &self.offsets
262 }
263
264 /// Returns the elements array.
265 pub fn elements(&self) -> &ArrayRef {
266 &self.elements
267 }
268
269 /// Create a copy of this array by adjusting `offsets` to start at `0` and removing elements not
270 /// referenced by the `offsets`.
271 pub fn reset_offsets(&self, recurse: bool) -> VortexResult<Self> {
272 let mut elements = self.sliced_elements();
273 if recurse && elements.is_canonical() {
274 elements = elements.to_canonical().compact()?.into_array();
275 } else if recurse && let Some(child_list_array) = elements.as_opt::<ListVTable>() {
276 elements = child_list_array.reset_offsets(recurse)?.into_array();
277 }
278
279 let offsets = self.offsets();
280 let first_offset = offsets.scalar_at(0);
281 let adjusted_offsets = sub_scalar(offsets, first_offset)?;
282
283 Self::try_new(elements, adjusted_offsets, self.validity.clone())
284 }
285}