Skip to main content

vortex_array/arrays/list/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Display;
5use std::fmt::Formatter;
6use std::sync::Arc;
7
8use num_traits::AsPrimitive;
9use vortex_error::VortexExpect;
10use vortex_error::VortexResult;
11use vortex_error::vortex_bail;
12use vortex_error::vortex_ensure;
13use vortex_error::vortex_panic;
14
15use crate::ArrayRef;
16use crate::IntoArray;
17use crate::LEGACY_SESSION;
18use crate::VortexSessionExecute;
19use crate::aggregate_fn::fns::min_max::min_max;
20use crate::array::Array;
21use crate::array::ArrayParts;
22use crate::array::TypedArrayRef;
23use crate::array::child_to_validity;
24use crate::array::validity_to_child;
25use crate::arrays::ConstantArray;
26use crate::arrays::List;
27use crate::arrays::Primitive;
28use crate::builtins::ArrayBuiltins;
29use crate::dtype::DType;
30use crate::dtype::NativePType;
31use crate::match_each_integer_ptype;
32use crate::match_each_native_ptype;
33use crate::scalar_fn::fns::operators::Operator;
34use crate::validity::Validity;
35
36/// The elements data array containing all list elements concatenated together.
37pub(super) const ELEMENTS_SLOT: usize = 0;
38/// The offsets array defining the start/end of each list within the elements array.
39pub(super) const OFFSETS_SLOT: usize = 1;
40/// The validity bitmap indicating which list elements are non-null.
41pub(super) const VALIDITY_SLOT: usize = 2;
42pub(super) const NUM_SLOTS: usize = 3;
43pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["elements", "offsets", "validity"];
44
45/// A list array that stores variable-length lists of elements, similar to `Vec<Vec<T>>`.
46///
47/// This mirrors the Apache Arrow List array encoding and provides efficient storage
48/// for nested data where each row contains a list of elements of the same type.
49///
50/// ## Data Layout
51///
52/// The list array uses an offset-based encoding:
53/// - **Elements array**: A flat array containing all list elements concatenated together
54/// - **Offsets array**: Integer array where `offsets[i]` is an (inclusive) start index into
55///   the **elements** and `offsets[i+1]` is the (exclusive) stop index for the `i`th list.
56/// - **Validity**: Optional mask indicating which lists are null
57///
58/// This allows for excellent cascading compression of the elements and offsets, as similar values
59/// are clustered together and the offsets have a predictable pattern and small deltas between
60/// consecutive elements.
61///
62/// ## Offset Semantics
63///
64/// - Offsets must be non-nullable integers (i32, i64, etc.)
65/// - Offsets array has length `n+1` where `n` is the number of lists
66/// - List `i` contains elements from `elements[offsets[i]..offsets[i+1]]`
67/// - Offsets must be monotonically increasing
68///
69/// # Examples
70///
71/// ```
72/// use vortex_array::arrays::{ListArray, PrimitiveArray};
73/// use vortex_array::arrays::list::ListArrayExt;
74/// use vortex_array::validity::Validity;
75/// use vortex_array::IntoArray;
76/// use vortex_buffer::buffer;
77/// use std::sync::Arc;
78///
79/// // Create a list array representing [[1, 2], [3, 4, 5], []]
80/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
81/// let offsets = buffer![0u32, 2, 5, 5].into_array(); // 3 lists
82///
83/// let list_array = ListArray::try_new(
84///     elements.into_array(),
85///     offsets.into_array(),
86///     Validity::NonNullable,
87/// ).unwrap();
88///
89/// assert_eq!(list_array.len(), 3);
90///
91/// // Access individual lists
92/// let first_list = list_array.list_elements_at(0).unwrap();
93/// assert_eq!(first_list.len(), 2); // [1, 2]
94///
95/// let third_list = list_array.list_elements_at(2).unwrap();
96/// assert!(third_list.is_empty()); // []
97/// ```
98#[derive(Clone, Debug, Default)]
99pub struct ListData;
100
101impl Display for ListData {
102    fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result {
103        Ok(())
104    }
105}
106
107pub struct ListDataParts {
108    pub elements: ArrayRef,
109    pub offsets: ArrayRef,
110    pub validity: Validity,
111    pub dtype: DType,
112}
113
114impl ListData {
115    pub(crate) fn make_slots(
116        elements: &ArrayRef,
117        offsets: &ArrayRef,
118        validity: &Validity,
119        len: usize,
120    ) -> Vec<Option<ArrayRef>> {
121        vec![
122            Some(elements.clone()),
123            Some(offsets.clone()),
124            validity_to_child(validity, len),
125        ]
126    }
127
128    /// Creates a new `ListArray`.
129    ///
130    /// # Panics
131    ///
132    /// Panics if the provided components do not satisfy the invariants documented
133    /// in `ListArray::new_unchecked`.
134    pub fn build(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
135        Self::try_build(elements, offsets, validity).vortex_expect("ListArray new")
136    }
137
138    /// Constructs a new `ListArray`.
139    ///
140    /// See `ListArray::new_unchecked` for more information.
141    ///
142    /// # Errors
143    ///
144    /// Returns an error if the provided components do not satisfy the invariants documented in
145    /// `ListArray::new_unchecked`.
146    pub(crate) fn try_build(
147        elements: ArrayRef,
148        offsets: ArrayRef,
149        validity: Validity,
150    ) -> VortexResult<Self> {
151        Self::validate(&elements, &offsets, &validity)?;
152
153        // SAFETY: validate ensures all invariants are met.
154        Ok(unsafe { Self::new_unchecked() })
155    }
156
157    /// Creates a new `ListArray` without validation from these components:
158    ///
159    /// * `elements` is a flat array containing all list elements concatenated.
160    /// * `offsets` is an integer array where `offsets[i]` is the start index for list `i`.
161    /// * `validity` holds the null values.
162    ///
163    /// # Safety
164    ///
165    /// The caller must ensure all of the following invariants are satisfied:
166    ///
167    /// - Offsets must be a non-nullable integer array.
168    /// - Offsets must have at least one element (even for empty lists, it should contain \[0\]).
169    /// - Offsets must be sorted (monotonically increasing).
170    /// - All offset values must be non-negative.
171    /// - The maximum offset must not exceed `elements.len()`.
172    /// - If validity is an array, its length must equal `offsets.len() - 1`.
173    pub unsafe fn new_unchecked() -> Self {
174        Self
175    }
176
177    /// Validates the components that would be used to create a `ListArray`.
178    ///
179    /// This function checks all the invariants required by `ListArray::new_unchecked`.
180    pub fn validate(
181        elements: &ArrayRef,
182        offsets: &ArrayRef,
183        validity: &Validity,
184    ) -> VortexResult<()> {
185        // Offsets must have at least one element
186        vortex_ensure!(
187            !offsets.is_empty(),
188            InvalidArgument: "Offsets must have at least one element, [0] for an empty list"
189        );
190
191        // Offsets must be of integer type, and cannot go lower than 0.
192        vortex_ensure!(
193            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
194            InvalidArgument: "offsets have invalid type {}",
195            offsets.dtype()
196        );
197
198        // We can safely unwrap the DType as primitive now
199        let offsets_ptype = offsets.dtype().as_ptype();
200
201        // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed)
202        if let Some(is_sorted) = offsets.statistics().compute_is_sorted() {
203            vortex_ensure!(is_sorted, InvalidArgument: "offsets must be sorted");
204        } else {
205            vortex_bail!(InvalidArgument: "offsets must report is_sorted statistic");
206        }
207
208        // Validate that offsets min is non-negative, and max does not exceed the length of
209        // the elements array.
210        let mut ctx = LEGACY_SESSION.create_execution_ctx();
211        if let Some(min_max) = min_max(offsets, &mut ctx)? {
212            match_each_integer_ptype!(offsets_ptype, |P| {
213                #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
214                {
215                    let max = min_max
216                        .max
217                        .as_primitive()
218                        .as_::<P>()
219                        .vortex_expect("offsets type must fit offsets values");
220                    let min = min_max
221                        .min
222                        .as_primitive()
223                        .as_::<P>()
224                        .vortex_expect("offsets type must fit offsets values");
225
226                    vortex_ensure!(
227                        min >= 0,
228                        InvalidArgument: "offsets minimum {min} outside valid range [0, {max}]"
229                    );
230
231                    vortex_ensure!(
232                        max <= P::try_from(elements.len()).unwrap_or_else(|_| vortex_panic!(
233                            "Offsets type {} must be able to fit elements length {}",
234                            <P as NativePType>::PTYPE,
235                            elements.len()
236                        )),
237                        InvalidArgument: "Max offset {max} is beyond the length of the elements array {}",
238                        elements.len()
239                    );
240                }
241            })
242        } else {
243            // TODO(aduffy): fallback to slower validation pathway?
244            vortex_bail!(
245                InvalidArgument: "offsets array with encoding {} must support min_max compute function",
246                offsets.encoding_id()
247            );
248        };
249
250        // If a validity array is present, it must be the same length as the ListArray
251        if let Some(validity_len) = validity.maybe_len() {
252            vortex_ensure!(
253                validity_len == offsets.len() - 1,
254                InvalidArgument: "validity with size {validity_len} does not match array size {}",
255                offsets.len() - 1
256            );
257        }
258
259        Ok(())
260    }
261    // TODO(connor)[ListView]: Create 2 functions `reset_offsets` and `recursive_reset_offsets`,
262    // where `reset_offsets` is infallible.
263    // Also, `reset_offsets` can be made more efficient by replacing `sub_scalar` with a match on
264    // the offset type and manual subtraction and fast path where `offsets[0] == 0`.
265}
266
267pub trait ListArrayExt: TypedArrayRef<List> {
268    fn nullability(&self) -> crate::dtype::Nullability {
269        match self.as_ref().dtype() {
270            DType::List(_, nullability) => *nullability,
271            _ => unreachable!("ListArrayExt requires a list dtype"),
272        }
273    }
274
275    fn elements(&self) -> &ArrayRef {
276        self.as_ref().slots()[ELEMENTS_SLOT]
277            .as_ref()
278            .vortex_expect("ListArray elements slot")
279    }
280
281    fn offsets(&self) -> &ArrayRef {
282        self.as_ref().slots()[OFFSETS_SLOT]
283            .as_ref()
284            .vortex_expect("ListArray offsets slot")
285    }
286
287    fn list_validity(&self) -> Validity {
288        child_to_validity(&self.as_ref().slots()[VALIDITY_SLOT], self.nullability())
289    }
290
291    fn list_validity_mask(&self) -> vortex_mask::Mask {
292        self.list_validity().to_mask(self.as_ref().len())
293    }
294
295    fn offset_at(&self, index: usize) -> VortexResult<usize> {
296        vortex_ensure!(
297            index <= self.as_ref().len(),
298            "Index {index} out of bounds 0..={}",
299            self.as_ref().len()
300        );
301
302        if let Some(p) = self.offsets().as_opt::<Primitive>() {
303            Ok(match_each_native_ptype!(p.ptype(), |P| {
304                p.as_slice::<P>()[index].as_()
305            }))
306        } else {
307            self.offsets()
308                .scalar_at(index)?
309                .as_primitive()
310                .as_::<usize>()
311                .ok_or_else(|| vortex_error::vortex_err!("offset value does not fit in usize"))
312        }
313    }
314
315    fn list_elements_at(&self, index: usize) -> VortexResult<ArrayRef> {
316        let start = self.offset_at(index)?;
317        let end = self.offset_at(index + 1)?;
318        self.elements().slice(start..end)
319    }
320
321    fn sliced_elements(&self) -> VortexResult<ArrayRef> {
322        let start = self.offset_at(0)?;
323        let end = self.offset_at(self.as_ref().len())?;
324        self.elements().slice(start..end)
325    }
326
327    fn element_dtype(&self) -> &DType {
328        self.elements().dtype()
329    }
330
331    fn reset_offsets(&self, recurse: bool) -> VortexResult<Array<List>> {
332        let mut elements = self.sliced_elements()?;
333        if recurse && elements.is_canonical() {
334            elements = elements.to_canonical()?.compact()?.into_array();
335        } else if recurse && let Some(child_list_array) = elements.as_opt::<List>() {
336            elements = child_list_array
337                .into_owned()
338                .reset_offsets(recurse)?
339                .into_array();
340        }
341
342        let offsets = self.offsets();
343        let first_offset = offsets.scalar_at(0)?;
344        let adjusted_offsets = offsets.clone().binary(
345            ConstantArray::new(first_offset, offsets.len()).into_array(),
346            Operator::Sub,
347        )?;
348
349        Array::<List>::try_new(elements, adjusted_offsets, self.list_validity())
350    }
351}
352impl<T: TypedArrayRef<List>> ListArrayExt for T {}
353
354impl Array<List> {
355    /// Creates a new `ListArray`.
356    pub fn new(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
357        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
358        let len = offsets.len().saturating_sub(1);
359        let slots = ListData::make_slots(&elements, &offsets, &validity, len);
360        let data = ListData::build(elements, offsets, validity);
361        unsafe {
362            Array::from_parts_unchecked(ArrayParts::new(List, dtype, len, data).with_slots(slots))
363        }
364    }
365
366    /// Constructs a new `ListArray`.
367    pub fn try_new(
368        elements: ArrayRef,
369        offsets: ArrayRef,
370        validity: Validity,
371    ) -> VortexResult<Self> {
372        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
373        let len = offsets.len().saturating_sub(1);
374        let slots = ListData::make_slots(&elements, &offsets, &validity, len);
375        let data = ListData::try_build(elements, offsets, validity)?;
376        Ok(unsafe {
377            Array::from_parts_unchecked(ArrayParts::new(List, dtype, len, data).with_slots(slots))
378        })
379    }
380
381    /// Creates a new `ListArray` without validation.
382    ///
383    /// # Safety
384    ///
385    /// See [`ListData::new_unchecked`].
386    pub unsafe fn new_unchecked(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
387        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
388        let len = offsets.len().saturating_sub(1);
389        let slots = ListData::make_slots(&elements, &offsets, &validity, len);
390        let data = unsafe { ListData::new_unchecked() };
391        unsafe {
392            Array::from_parts_unchecked(ArrayParts::new(List, dtype, len, data).with_slots(slots))
393        }
394    }
395
396    pub fn into_data_parts(self) -> ListDataParts {
397        let dtype = self.dtype().clone();
398        let elements = self.slots()[ELEMENTS_SLOT]
399            .clone()
400            .vortex_expect("ListArray elements slot");
401        let offsets = self.slots()[OFFSETS_SLOT]
402            .clone()
403            .vortex_expect("ListArray offsets slot");
404        let validity = self.list_validity();
405        ListDataParts {
406            elements,
407            offsets,
408            validity,
409            dtype,
410        }
411    }
412}