vortex_array/arrays/list/array.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::sync::Arc;
5
6use num_traits::AsPrimitive;
7use vortex_dtype::{DType, match_each_integer_ptype, match_each_native_ptype};
8use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_ensure};
9
10use crate::arrays::{ListVTable, PrimitiveVTable};
11use crate::compute::{min_max, sub_scalar};
12use crate::stats::ArrayStats;
13use crate::validity::Validity;
14use crate::{Array, ArrayRef, IntoArray};
15
16/// A list array that stores variable-length lists of elements, similar to `Vec<Vec<T>>`.
17///
18/// This mirrors the Apache Arrow List array encoding and provides efficient storage
19/// for nested data where each row contains a list of elements of the same type.
20///
21/// ## Data Layout
22///
23/// The list array uses an offset-based encoding:
24/// - **Elements array**: A flat array containing all list elements concatenated together
25/// - **Offsets array**: Integer array where `offsets[i]` is an (inclusive) start index into
26///   the **elements** and `offsets[i+1]` is the (exclusive) stop index for the `i`th list.
27/// - **Validity**: Optional mask indicating which lists are null
28///
29/// This allows for excellent cascading compression of the elements and offsets, as similar values
30/// are clustered together and the offsets have a predictable pattern and small deltas between
31/// consecutive elements.
32///
33/// ## Offset Semantics
34///
35/// - Offsets must be non-nullable integers (i32, i64, etc.)
36/// - Offsets array has length `n+1` where `n` is the number of lists
37/// - List `i` contains elements from `elements[offsets[i]..offsets[i+1]]`
38/// - Offsets must be monotonically increasing
39///
40/// # Examples
41///
42/// ```
43/// use vortex_array::arrays::{ListArray, PrimitiveArray};
44/// use vortex_array::validity::Validity;
45/// use vortex_array::IntoArray;
46/// use vortex_buffer::buffer;
47/// use std::sync::Arc;
48///
49/// // Create a list array representing [[1, 2], [3, 4, 5], []]
50/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
51/// let offsets = buffer![0u32, 2, 5, 5].into_array(); // 3 lists
52///
53/// let list_array = ListArray::try_new(
54///     elements.into_array(),
55///     offsets.into_array(),
56///     Validity::NonNullable,
57/// ).unwrap();
58///
59/// assert_eq!(list_array.len(), 3);
60///
61/// // Access individual lists
62/// let first_list = list_array.list_elements_at(0);
63/// assert_eq!(first_list.len(), 2); // [1, 2]
64///
65/// let third_list = list_array.list_elements_at(2);
66/// assert!(third_list.is_empty()); // []
67/// ```
68#[derive(Clone, Debug)]
69pub struct ListArray {
70    pub(super) dtype: DType,
71    pub(super) elements: ArrayRef,
72    pub(super) offsets: ArrayRef,
73    pub(super) validity: Validity,
74    pub(super) stats_set: ArrayStats,
75}
76
77impl ListArray {
78    /// Creates a new [`ListArray`].
79    ///
80    /// # Panics
81    ///
82    /// Panics if the provided components do not satisfy the invariants documented
83    /// in [`ListArray::new_unchecked`].
84    pub fn new(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
85        Self::try_new(elements, offsets, validity).vortex_expect("ListArray new")
86    }
87
88    /// Constructs a new `ListArray`.
89    ///
90    /// See [`ListArray::new_unchecked`] for more information.
91    ///
92    /// # Errors
93    ///
94    /// Returns an error if the provided components do not satisfy the invariants documented in
95    /// [`ListArray::new_unchecked`].
96    pub fn try_new(
97        elements: ArrayRef,
98        offsets: ArrayRef,
99        validity: Validity,
100    ) -> VortexResult<Self> {
101        Self::validate(&elements, &offsets, &validity)?;
102
103        // SAFETY: validate ensures all invariants are met.
104        Ok(unsafe { Self::new_unchecked(elements, offsets, validity) })
105    }
106
107    /// Creates a new [`ListArray`] without validation from these components:
108    ///
109    /// * `elements` is a flat array containing all list elements concatenated.
110    /// * `offsets` is an integer array where `offsets[i]` is the start index for list `i`.
111    /// * `validity` holds the null values.
112    ///
113    /// # Safety
114    ///
115    /// The caller must ensure all of the following invariants are satisfied:
116    ///
117    /// - Offsets must be a non-nullable integer array.
118    /// - Offsets must have at least one element (even for empty lists, it should contain \[0\]).
119    /// - Offsets must be sorted (monotonically increasing).
120    /// - All offset values must be non-negative.
121    /// - The maximum offset must not exceed `elements.len()`.
122    /// - If validity is an array, its length must equal `offsets.len() - 1`.
123    pub unsafe fn new_unchecked(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
124        #[cfg(debug_assertions)]
125        Self::validate(&elements, &offsets, &validity)
126            .vortex_expect("[Debug Assertion]: Invalid `ListViewArray` parameters");
127
128        Self {
129            dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
130            elements,
131            offsets,
132            validity,
133            stats_set: Default::default(),
134        }
135    }
136
137    /// Validates the components that would be used to create a [`ListArray`].
138    ///
139    /// This function checks all the invariants required by [`ListArray::new_unchecked`].
140    pub fn validate(
141        elements: &dyn Array,
142        offsets: &dyn Array,
143        validity: &Validity,
144    ) -> VortexResult<()> {
145        // Offsets must have at least one element
146        vortex_ensure!(
147            !offsets.is_empty(),
148            "Offsets must have at least one element, [0] for an empty list"
149        );
150
151        // Offsets must be of integer type, and cannot go lower than 0.
152        vortex_ensure!(
153            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
154            "offsets have invalid type {}",
155            offsets.dtype()
156        );
157
158        // We can safely unwrap the DType as primitive now
159        let offsets_ptype = offsets.dtype().as_ptype();
160
161        // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed)
162        if let Some(is_sorted) = offsets.statistics().compute_is_sorted() {
163            vortex_ensure!(is_sorted, "offsets must be sorted");
164        } else {
165            vortex_bail!("offsets must report is_sorted statistic");
166        }
167
168        // Validate that offsets min is non-negative, and max does not exceed the length of
169        // the elements array.
170        if let Some(min_max) = min_max(offsets)? {
171            match_each_integer_ptype!(offsets_ptype, |P| {
172                let max_offset = P::try_from(offsets.scalar_at(offsets.len() - 1))
173                    .vortex_expect("Offsets type must fit offsets values");
174
175                #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
176                {
177                    if let Some(min) = min_max.min.as_primitive().as_::<P>() {
178                        vortex_ensure!(
179                            min >= 0 && min <= max_offset,
180                            "offsets minimum {min} outside valid range [0, {max_offset}]"
181                        );
182                    }
183
184                    if let Some(max) = min_max.max.as_primitive().as_::<P>() {
185                        vortex_ensure!(
186                            max >= 0 && max <= max_offset,
187                            "offsets maximum {max} outside valid range [0, {max_offset}]"
188                        )
189                    }
190                }
191
192                vortex_ensure!(
193                    max_offset
194                        <= P::try_from(elements.len())
195                            .vortex_expect("Offsets type must be able to fit elements length"),
196                    "Max offset {max_offset} is beyond the length of the elements array {}",
197                    elements.len()
198                );
199            })
200        } else {
201            // TODO(aduffy): fallback to slower validation pathway?
202            vortex_bail!(
203                "offsets array with encoding {} must support min_max compute function",
204                offsets.encoding_id()
205            );
206        };
207
208        // If a validity array is present, it must be the same length as the ListArray
209        if let Some(validity_len) = validity.maybe_len() {
210            vortex_ensure!(
211                validity_len == offsets.len() - 1,
212                "validity with size {validity_len} does not match array size {}",
213                offsets.len() - 1
214            );
215        }
216
217        Ok(())
218    }
219
220    /// Returns the offset at the given index from the list array.
221    ///
222    /// Panics if the index is out of bounds.
223    pub fn offset_at(&self, index: usize) -> usize {
224        assert!(
225            index <= self.len(),
226            "Index {index} out of bounds 0..={}",
227            self.len()
228        );
229
230        self.offsets()
231            .as_opt::<PrimitiveVTable>()
232            .map(|p| match_each_native_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
233            .unwrap_or_else(|| {
234                self.offsets()
235                    .scalar_at(index)
236                    .as_primitive()
237                    .as_::<usize>()
238                    .vortex_expect("index must fit in usize")
239            })
240    }
241
242    /// Returns the elements of the list scalar at the given index of the list array.
243    pub fn list_elements_at(&self, index: usize) -> ArrayRef {
244        let start = self.offset_at(index);
245        let end = self.offset_at(index + 1);
246        self.elements().slice(start..end)
247    }
248
249    /// Returns elements of the list array referenced by the offsets array.
250    ///
251    /// This is useful for discarding any potentially unused parts of the underlying `elements`
252    /// child array.
253    pub fn sliced_elements(&self) -> ArrayRef {
254        let start = self.offset_at(0);
255        let end = self.offset_at(self.len());
256        self.elements().slice(start..end)
257    }
258
259    /// Returns the offsets array.
260    pub fn offsets(&self) -> &ArrayRef {
261        &self.offsets
262    }
263
264    /// Returns the elements array.
265    pub fn elements(&self) -> &ArrayRef {
266        &self.elements
267    }
268
269    /// Create a copy of this array by adjusting `offsets` to start at `0` and removing elements not
270    /// referenced by the `offsets`.
271    pub fn reset_offsets(&self, recurse: bool) -> VortexResult<Self> {
272        let mut elements = self.sliced_elements();
273        if recurse && elements.is_canonical() {
274            elements = elements.to_canonical().compact()?.into_array();
275        } else if recurse && let Some(child_list_array) = elements.as_opt::<ListVTable>() {
276            elements = child_list_array.reset_offsets(recurse)?.into_array();
277        }
278
279        let offsets = self.offsets();
280        let first_offset = offsets.scalar_at(0);
281        let adjusted_offsets = sub_scalar(offsets, first_offset)?;
282
283        Self::try_new(elements, adjusted_offsets, self.validity.clone())
284    }
285}