Skip to main content

vortex_array/arrays/list/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Display;
5use std::fmt::Formatter;
6use std::sync::Arc;
7
8use num_traits::AsPrimitive;
9use smallvec::smallvec;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_bail;
13use vortex_error::vortex_ensure;
14use vortex_error::vortex_panic;
15
16use crate::ArrayRef;
17use crate::ArraySlots;
18use crate::Canonical;
19use crate::ExecutionCtx;
20use crate::IntoArray;
21use crate::LEGACY_SESSION;
22use crate::VortexSessionExecute;
23use crate::aggregate_fn::fns::min_max::min_max;
24use crate::array::Array;
25use crate::array::ArrayParts;
26use crate::array::TypedArrayRef;
27use crate::array::child_to_validity;
28use crate::array::validity_to_child;
29use crate::arrays::ConstantArray;
30use crate::arrays::List;
31use crate::arrays::Primitive;
32use crate::builtins::ArrayBuiltins;
33use crate::dtype::DType;
34use crate::dtype::NativePType;
35use crate::match_each_integer_ptype;
36use crate::match_each_native_ptype;
37use crate::scalar_fn::fns::operators::Operator;
38use crate::validity::Validity;
39
40/// The elements data array containing all list elements concatenated together.
41pub(super) const ELEMENTS_SLOT: usize = 0;
42/// The offsets array defining the start/end of each list within the elements array.
43pub(super) const OFFSETS_SLOT: usize = 1;
44/// The validity bitmap indicating which list elements are non-null.
45pub(super) const VALIDITY_SLOT: usize = 2;
46pub(super) const NUM_SLOTS: usize = 3;
47pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["elements", "offsets", "validity"];
48
49/// A list array that stores variable-length lists of elements, similar to `Vec<Vec<T>>`.
50///
51/// This mirrors the Apache Arrow List array encoding and provides efficient storage
52/// for nested data where each row contains a list of elements of the same type.
53///
54/// ## Data Layout
55///
56/// The list array uses an offset-based encoding:
57/// - **Elements array**: A flat array containing all list elements concatenated together
58/// - **Offsets array**: Integer array where `offsets[i]` is an (inclusive) start index into
59///   the **elements** and `offsets[i+1]` is the (exclusive) stop index for the `i`th list.
60/// - **Validity**: Optional mask indicating which lists are null
61///
62/// This allows for excellent cascading compression of the elements and offsets, as similar values
63/// are clustered together and the offsets have a predictable pattern and small deltas between
64/// consecutive elements.
65///
66/// ## Offset Semantics
67///
68/// - Offsets must be non-nullable integers (i32, i64, etc.)
69/// - Offsets array has length `n+1` where `n` is the number of lists
70/// - List `i` contains elements from `elements[offsets[i]..offsets[i+1]]`
71/// - Offsets must be monotonically increasing
72///
73/// # Examples
74///
75/// ```
76/// use vortex_array::arrays::{ListArray, PrimitiveArray};
77/// use vortex_array::arrays::list::ListArrayExt;
78/// use vortex_array::validity::Validity;
79/// use vortex_array::IntoArray;
80/// use vortex_buffer::buffer;
81/// use std::sync::Arc;
82///
83/// // Create a list array representing [[1, 2], [3, 4, 5], []]
84/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
85/// let offsets = buffer![0u32, 2, 5, 5].into_array(); // 3 lists
86///
87/// let list_array = ListArray::try_new(
88///     elements.into_array(),
89///     offsets.into_array(),
90///     Validity::NonNullable,
91/// ).unwrap();
92///
93/// assert_eq!(list_array.len(), 3);
94///
95/// // Access individual lists
96/// let first_list = list_array.list_elements_at(0).unwrap();
97/// assert_eq!(first_list.len(), 2); // [1, 2]
98///
99/// let third_list = list_array.list_elements_at(2).unwrap();
100/// assert!(third_list.is_empty()); // []
101/// ```
102#[derive(Clone, Debug, Default)]
103pub struct ListData;
104
105impl Display for ListData {
106    fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result {
107        Ok(())
108    }
109}
110
111pub struct ListDataParts {
112    pub elements: ArrayRef,
113    pub offsets: ArrayRef,
114    pub validity: Validity,
115    pub dtype: DType,
116}
117
118impl ListData {
119    pub(crate) fn make_slots(
120        elements: &ArrayRef,
121        offsets: &ArrayRef,
122        validity: &Validity,
123        len: usize,
124    ) -> ArraySlots {
125        smallvec![
126            Some(elements.clone()),
127            Some(offsets.clone()),
128            validity_to_child(validity, len),
129        ]
130    }
131
132    /// Creates a new `ListArray`.
133    ///
134    /// # Panics
135    ///
136    /// Panics if the provided components do not satisfy the invariants documented
137    /// in `ListArray::new_unchecked`.
138    pub fn build(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
139        Self::try_build(elements, offsets, validity).vortex_expect("ListArray new")
140    }
141
142    /// Constructs a new `ListArray`.
143    ///
144    /// See `ListArray::new_unchecked` for more information.
145    ///
146    /// # Errors
147    ///
148    /// Returns an error if the provided components do not satisfy the invariants documented in
149    /// `ListArray::new_unchecked`.
150    pub(crate) fn try_build(
151        elements: ArrayRef,
152        offsets: ArrayRef,
153        validity: Validity,
154    ) -> VortexResult<Self> {
155        Self::validate(&elements, &offsets, &validity)?;
156
157        // SAFETY: validate ensures all invariants are met.
158        Ok(unsafe { Self::new_unchecked() })
159    }
160
161    /// Creates a new `ListArray` without validation from these components:
162    ///
163    /// * `elements` is a flat array containing all list elements concatenated.
164    /// * `offsets` is an integer array where `offsets[i]` is the start index for list `i`.
165    /// * `validity` holds the null values.
166    ///
167    /// # Safety
168    ///
169    /// The caller must ensure all of the following invariants are satisfied:
170    ///
171    /// - Offsets must be a non-nullable integer array.
172    /// - Offsets must have at least one element (even for empty lists, it should contain \[0\]).
173    /// - Offsets must be sorted (monotonically increasing).
174    /// - All offset values must be non-negative.
175    /// - The maximum offset must not exceed `elements.len()`.
176    /// - If validity is an array, its length must equal `offsets.len() - 1`.
177    pub unsafe fn new_unchecked() -> Self {
178        Self
179    }
180
181    /// Validates the components that would be used to create a `ListArray`.
182    ///
183    /// This function checks all the invariants required by `ListArray::new_unchecked`.
184    pub fn validate(
185        elements: &ArrayRef,
186        offsets: &ArrayRef,
187        validity: &Validity,
188    ) -> VortexResult<()> {
189        // Offsets must have at least one element
190        vortex_ensure!(
191            !offsets.is_empty(),
192            InvalidArgument: "Offsets must have at least one element, [0] for an empty list"
193        );
194
195        // Offsets must be of integer type, and cannot go lower than 0.
196        vortex_ensure!(
197            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
198            InvalidArgument: "offsets have invalid type {}",
199            offsets.dtype()
200        );
201
202        // We can safely unwrap the DType as primitive now
203        let offsets_ptype = offsets.dtype().as_ptype();
204        let mut ctx = LEGACY_SESSION.create_execution_ctx();
205
206        // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed)
207        if let Some(is_sorted) = offsets.statistics().compute_is_sorted(&mut ctx) {
208            vortex_ensure!(is_sorted, InvalidArgument: "offsets must be sorted");
209        } else {
210            vortex_bail!(InvalidArgument: "offsets must report is_sorted statistic");
211        }
212
213        // Validate that offsets min is non-negative, and max does not exceed the length of
214        // the elements array.
215        if let Some(min_max) = min_max(offsets, &mut ctx)? {
216            match_each_integer_ptype!(offsets_ptype, |P| {
217                #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
218                {
219                    let max = min_max
220                        .max
221                        .as_primitive()
222                        .as_::<P>()
223                        .vortex_expect("offsets type must fit offsets values");
224                    let min = min_max
225                        .min
226                        .as_primitive()
227                        .as_::<P>()
228                        .vortex_expect("offsets type must fit offsets values");
229
230                    vortex_ensure!(
231                        min >= 0,
232                        InvalidArgument: "offsets minimum {min} outside valid range [0, {max}]"
233                    );
234
235                    vortex_ensure!(
236                        max <= P::try_from(elements.len()).unwrap_or_else(|_| vortex_panic!(
237                            "Offsets type {} must be able to fit elements length {}",
238                            <P as NativePType>::PTYPE,
239                            elements.len()
240                        )),
241                        InvalidArgument: "Max offset {max} is beyond the length of the elements array {}",
242                        elements.len()
243                    );
244                }
245            })
246        } else {
247            // TODO(aduffy): fallback to slower validation pathway?
248            vortex_bail!(
249                InvalidArgument: "offsets array with encoding {} must support min_max compute function",
250                offsets.encoding_id()
251            );
252        };
253
254        // If a validity array is present, it must be the same length as the ListArray
255        if let Some(validity_len) = validity.maybe_len() {
256            vortex_ensure!(
257                validity_len == offsets.len() - 1,
258                InvalidArgument: "validity with size {validity_len} does not match array size {}",
259                offsets.len() - 1
260            );
261        }
262
263        Ok(())
264    }
265    // TODO(connor)[ListView]: Create 2 functions `reset_offsets` and `recursive_reset_offsets`,
266    // where `reset_offsets` is infallible.
267    // Also, `reset_offsets` can be made more efficient by replacing `sub_scalar` with a match on
268    // the offset type and manual subtraction and fast path where `offsets[0] == 0`.
269}
270
271pub trait ListArrayExt: TypedArrayRef<List> {
272    fn nullability(&self) -> crate::dtype::Nullability {
273        match self.as_ref().dtype() {
274            DType::List(_, nullability) => *nullability,
275            _ => unreachable!("ListArrayExt requires a list dtype"),
276        }
277    }
278
279    fn elements(&self) -> &ArrayRef {
280        self.as_ref().slots()[ELEMENTS_SLOT]
281            .as_ref()
282            .vortex_expect("ListArray elements slot")
283    }
284
285    fn offsets(&self) -> &ArrayRef {
286        self.as_ref().slots()[OFFSETS_SLOT]
287            .as_ref()
288            .vortex_expect("ListArray offsets slot")
289    }
290
291    fn list_validity(&self) -> Validity {
292        child_to_validity(
293            self.as_ref().slots()[VALIDITY_SLOT].as_ref(),
294            self.nullability(),
295        )
296    }
297
298    fn offset_at(&self, index: usize) -> VortexResult<usize> {
299        vortex_ensure!(
300            index <= self.as_ref().len(),
301            "Index {index} out of bounds 0..={}",
302            self.as_ref().len()
303        );
304
305        if let Some(p) = self.offsets().as_opt::<Primitive>() {
306            Ok(match_each_native_ptype!(p.ptype(), |P| {
307                p.as_slice::<P>()[index].as_()
308            }))
309        } else {
310            self.offsets()
311                .execute_scalar(index, &mut LEGACY_SESSION.create_execution_ctx())?
312                .as_primitive()
313                .as_::<usize>()
314                .ok_or_else(|| vortex_error::vortex_err!("offset value does not fit in usize"))
315        }
316    }
317
318    fn list_elements_at(&self, index: usize) -> VortexResult<ArrayRef> {
319        let start = self.offset_at(index)?;
320        let end = self.offset_at(index + 1)?;
321        self.elements().slice(start..end)
322    }
323
324    fn sliced_elements(&self) -> VortexResult<ArrayRef> {
325        let start = self.offset_at(0)?;
326        let end = self.offset_at(self.as_ref().len())?;
327        self.elements().slice(start..end)
328    }
329
330    fn element_dtype(&self) -> &DType {
331        self.elements().dtype()
332    }
333
334    fn reset_offsets(&self, recurse: bool, ctx: &mut ExecutionCtx) -> VortexResult<Array<List>> {
335        let mut elements = self.sliced_elements()?;
336        if recurse && elements.is_canonical() {
337            let compacted = elements
338                .clone()
339                .execute::<Canonical>(ctx)?
340                .compact(ctx)?
341                .into_array();
342            elements = compacted;
343        } else if recurse && let Some(child_list_array) = elements.as_opt::<List>() {
344            elements = child_list_array
345                .into_owned()
346                .reset_offsets(recurse, ctx)?
347                .into_array();
348        }
349
350        let offsets = self.offsets();
351        let first_offset = offsets.execute_scalar(0, ctx)?;
352        let adjusted_offsets = offsets.clone().binary(
353            ConstantArray::new(first_offset, offsets.len()).into_array(),
354            Operator::Sub,
355        )?;
356
357        Array::<List>::try_new(elements, adjusted_offsets, self.list_validity())
358    }
359}
360impl<T: TypedArrayRef<List>> ListArrayExt for T {}
361
362impl Array<List> {
363    /// Creates a new `ListArray`.
364    pub fn new(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
365        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
366        let len = offsets.len().saturating_sub(1);
367        let slots = ListData::make_slots(&elements, &offsets, &validity, len);
368        let data = ListData::build(elements, offsets, validity);
369        unsafe {
370            Array::from_parts_unchecked(ArrayParts::new(List, dtype, len, data).with_slots(slots))
371        }
372    }
373
374    /// Constructs a new `ListArray`.
375    pub fn try_new(
376        elements: ArrayRef,
377        offsets: ArrayRef,
378        validity: Validity,
379    ) -> VortexResult<Self> {
380        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
381        let len = offsets.len().saturating_sub(1);
382        let slots = ListData::make_slots(&elements, &offsets, &validity, len);
383        let data = ListData::try_build(elements, offsets, validity)?;
384        Ok(unsafe {
385            Array::from_parts_unchecked(ArrayParts::new(List, dtype, len, data).with_slots(slots))
386        })
387    }
388
389    /// Creates a new `ListArray` without validation.
390    ///
391    /// # Safety
392    ///
393    /// See [`ListData::new_unchecked`].
394    pub unsafe fn new_unchecked(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
395        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
396        let len = offsets.len().saturating_sub(1);
397        let slots = ListData::make_slots(&elements, &offsets, &validity, len);
398        let data = unsafe { ListData::new_unchecked() };
399        unsafe {
400            Array::from_parts_unchecked(ArrayParts::new(List, dtype, len, data).with_slots(slots))
401        }
402    }
403
404    pub fn into_data_parts(self) -> ListDataParts {
405        let dtype = self.dtype().clone();
406        let elements = self.slots()[ELEMENTS_SLOT]
407            .clone()
408            .vortex_expect("ListArray elements slot");
409        let offsets = self.slots()[OFFSETS_SLOT]
410            .clone()
411            .vortex_expect("ListArray offsets slot");
412        let validity = self.list_validity();
413        ListDataParts {
414            elements,
415            offsets,
416            validity,
417            dtype,
418        }
419    }
420}