Skip to main content

vortex_array/arrays/list/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Display;
5use std::fmt::Formatter;
6use std::sync::Arc;
7
8use num_traits::AsPrimitive;
9use smallvec::smallvec;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_bail;
13use vortex_error::vortex_ensure;
14use vortex_error::vortex_panic;
15
16use crate::ArrayRef;
17use crate::ArraySlots;
18use crate::Canonical;
19use crate::ExecutionCtx;
20use crate::IntoArray;
21use crate::LEGACY_SESSION;
22use crate::VortexSessionExecute;
23use crate::aggregate_fn::NumericalAggregateOpts;
24use crate::aggregate_fn::fns::min_max::min_max;
25use crate::array::Array;
26use crate::array::ArrayParts;
27use crate::array::TypedArrayRef;
28use crate::array::child_to_validity;
29use crate::array::validity_to_child;
30use crate::arrays::ConstantArray;
31use crate::arrays::List;
32use crate::arrays::Primitive;
33use crate::builtins::ArrayBuiltins;
34use crate::dtype::DType;
35use crate::dtype::NativePType;
36use crate::match_each_integer_ptype;
37use crate::match_each_native_ptype;
38use crate::scalar_fn::fns::operators::Operator;
39use crate::validity::Validity;
40
41/// The elements data array containing all list elements concatenated together.
42pub(super) const ELEMENTS_SLOT: usize = 0;
43/// The offsets array defining the start/end of each list within the elements array.
44pub(super) const OFFSETS_SLOT: usize = 1;
45/// The validity bitmap indicating which list elements are non-null.
46pub(super) const VALIDITY_SLOT: usize = 2;
47pub(super) const NUM_SLOTS: usize = 3;
48pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["elements", "offsets", "validity"];
49
50/// A list array that stores variable-length lists of elements, similar to `Vec<Vec<T>>`.
51///
52/// This mirrors the Apache Arrow List array encoding and provides efficient storage
53/// for nested data where each row contains a list of elements of the same type.
54///
55/// ## Data Layout
56///
57/// The list array uses an offset-based encoding:
58/// - **Elements array**: A flat array containing all list elements concatenated together
59/// - **Offsets array**: Integer array where `offsets[i]` is an (inclusive) start index into
60///   the **elements** and `offsets[i+1]` is the (exclusive) stop index for the `i`th list.
61/// - **Validity**: Optional mask indicating which lists are null
62///
63/// This allows for excellent cascading compression of the elements and offsets, as similar values
64/// are clustered together and the offsets have a predictable pattern and small deltas between
65/// consecutive elements.
66///
67/// ## Offset Semantics
68///
69/// - Offsets must be non-nullable integers (i32, i64, etc.)
70/// - Offsets array has length `n+1` where `n` is the number of lists
71/// - List `i` contains elements from `elements[offsets[i]..offsets[i+1]]`
72/// - Offsets must be monotonically increasing
73///
74/// # Examples
75///
76/// ```
77/// use vortex_array::arrays::{ListArray, PrimitiveArray};
78/// use vortex_array::arrays::list::ListArrayExt;
79/// use vortex_array::validity::Validity;
80/// use vortex_array::IntoArray;
81/// use vortex_buffer::buffer;
82/// use std::sync::Arc;
83///
84/// // Create a list array representing [[1, 2], [3, 4, 5], []]
85/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
86/// let offsets = buffer![0u32, 2, 5, 5].into_array(); // 3 lists
87///
88/// let list_array = ListArray::try_new(
89///     elements.into_array(),
90///     offsets.into_array(),
91///     Validity::NonNullable,
92/// ).unwrap();
93///
94/// assert_eq!(list_array.len(), 3);
95///
96/// // Access individual lists
97/// let first_list = list_array.list_elements_at(0).unwrap();
98/// assert_eq!(first_list.len(), 2); // [1, 2]
99///
100/// let third_list = list_array.list_elements_at(2).unwrap();
101/// assert!(third_list.is_empty()); // []
102/// ```
103#[derive(Clone, Debug, Default)]
104pub struct ListData;
105
106impl Display for ListData {
107    fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result {
108        Ok(())
109    }
110}
111
112pub struct ListDataParts {
113    pub elements: ArrayRef,
114    pub offsets: ArrayRef,
115    pub validity: Validity,
116    pub dtype: DType,
117}
118
119impl ListData {
120    pub(crate) fn make_slots(
121        elements: &ArrayRef,
122        offsets: &ArrayRef,
123        validity: &Validity,
124        len: usize,
125    ) -> ArraySlots {
126        smallvec![
127            Some(elements.clone()),
128            Some(offsets.clone()),
129            validity_to_child(validity, len),
130        ]
131    }
132
133    /// Creates a new `ListArray`.
134    ///
135    /// # Panics
136    ///
137    /// Panics if the provided components do not satisfy the invariants documented
138    /// in `ListArray::new_unchecked`.
139    pub fn build(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
140        Self::try_build(elements, offsets, validity).vortex_expect("ListArray new")
141    }
142
143    /// Constructs a new `ListArray`.
144    ///
145    /// See `ListArray::new_unchecked` for more information.
146    ///
147    /// # Errors
148    ///
149    /// Returns an error if the provided components do not satisfy the invariants documented in
150    /// `ListArray::new_unchecked`.
151    pub(crate) fn try_build(
152        elements: ArrayRef,
153        offsets: ArrayRef,
154        validity: Validity,
155    ) -> VortexResult<Self> {
156        Self::validate(&elements, &offsets, &validity)?;
157
158        // SAFETY: validate ensures all invariants are met.
159        Ok(unsafe { Self::new_unchecked() })
160    }
161
162    /// Creates a new `ListArray` without validation from these components:
163    ///
164    /// * `elements` is a flat array containing all list elements concatenated.
165    /// * `offsets` is an integer array where `offsets[i]` is the start index for list `i`.
166    /// * `validity` holds the null values.
167    ///
168    /// # Safety
169    ///
170    /// The caller must ensure all of the following invariants are satisfied:
171    ///
172    /// - Offsets must be a non-nullable integer array.
173    /// - Offsets must have at least one element (even for empty lists, it should contain \[0\]).
174    /// - Offsets must be sorted (monotonically increasing).
175    /// - All offset values must be non-negative.
176    /// - The maximum offset must not exceed `elements.len()`.
177    /// - If validity is an array, its length must equal `offsets.len() - 1`.
178    pub unsafe fn new_unchecked() -> Self {
179        Self
180    }
181
182    /// Validates the components that would be used to create a `ListArray`.
183    ///
184    /// This function checks all the invariants required by `ListArray::new_unchecked`.
185    pub fn validate(
186        elements: &ArrayRef,
187        offsets: &ArrayRef,
188        validity: &Validity,
189    ) -> VortexResult<()> {
190        // Offsets must have at least one element
191        vortex_ensure!(
192            !offsets.is_empty(),
193            InvalidArgument: "Offsets must have at least one element, [0] for an empty list"
194        );
195
196        // Offsets must be of integer type, and cannot go lower than 0.
197        vortex_ensure!(
198            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
199            InvalidArgument: "offsets have invalid type {}",
200            offsets.dtype()
201        );
202
203        // We can safely unwrap the DType as primitive now
204        let offsets_ptype = offsets.dtype().as_ptype();
205        let mut ctx = LEGACY_SESSION.create_execution_ctx();
206
207        // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed)
208        if let Some(is_sorted) = offsets.statistics().compute_is_sorted(&mut ctx) {
209            vortex_ensure!(is_sorted, InvalidArgument: "offsets must be sorted");
210        } else {
211            vortex_bail!(InvalidArgument: "offsets must report is_sorted statistic");
212        }
213
214        // Validate that offsets min is non-negative, and max does not exceed the length of
215        // the elements array.
216        if let Some(min_max) = min_max(offsets, &mut ctx, NumericalAggregateOpts::default())? {
217            match_each_integer_ptype!(offsets_ptype, |P| {
218                #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
219                {
220                    let max = min_max
221                        .max
222                        .as_primitive()
223                        .as_::<P>()
224                        .vortex_expect("offsets type must fit offsets values");
225                    let min = min_max
226                        .min
227                        .as_primitive()
228                        .as_::<P>()
229                        .vortex_expect("offsets type must fit offsets values");
230
231                    vortex_ensure!(
232                        min >= 0,
233                        InvalidArgument: "offsets minimum {min} outside valid range [0, {max}]"
234                    );
235
236                    vortex_ensure!(
237                        max <= P::try_from(elements.len()).unwrap_or_else(|_| vortex_panic!(
238                            "Offsets type {} must be able to fit elements length {}",
239                            <P as NativePType>::PTYPE,
240                            elements.len()
241                        )),
242                        InvalidArgument: "Max offset {max} is beyond the length of the elements array {}",
243                        elements.len()
244                    );
245                }
246            })
247        } else {
248            // TODO(aduffy): fallback to slower validation pathway?
249            vortex_bail!(
250                InvalidArgument: "offsets array with encoding {} must support min_max compute function",
251                offsets.encoding_id()
252            );
253        };
254
255        // If a validity array is present, it must be the same length as the ListArray
256        if let Some(validity_len) = validity.maybe_len() {
257            vortex_ensure!(
258                validity_len == offsets.len() - 1,
259                InvalidArgument: "validity with size {validity_len} does not match array size {}",
260                offsets.len() - 1
261            );
262        }
263
264        Ok(())
265    }
266    // TODO(connor)[ListView]: Create 2 functions `reset_offsets` and `recursive_reset_offsets`,
267    // where `reset_offsets` is infallible.
268    // Also, `reset_offsets` can be made more efficient by replacing `sub_scalar` with a match on
269    // the offset type and manual subtraction and fast path where `offsets[0] == 0`.
270}
271
272pub trait ListArrayExt: TypedArrayRef<List> {
273    fn nullability(&self) -> crate::dtype::Nullability {
274        match self.as_ref().dtype() {
275            DType::List(_, nullability) => *nullability,
276            _ => unreachable!("ListArrayExt requires a list dtype"),
277        }
278    }
279
280    fn elements(&self) -> &ArrayRef {
281        self.as_ref().slots()[ELEMENTS_SLOT]
282            .as_ref()
283            .vortex_expect("ListArray elements slot")
284    }
285
286    fn offsets(&self) -> &ArrayRef {
287        self.as_ref().slots()[OFFSETS_SLOT]
288            .as_ref()
289            .vortex_expect("ListArray offsets slot")
290    }
291
292    fn list_validity(&self) -> Validity {
293        child_to_validity(
294            self.as_ref().slots()[VALIDITY_SLOT].as_ref(),
295            self.nullability(),
296        )
297    }
298
299    fn offset_at(&self, index: usize) -> VortexResult<usize> {
300        vortex_ensure!(
301            index <= self.as_ref().len(),
302            "Index {index} out of bounds 0..={}",
303            self.as_ref().len()
304        );
305
306        if let Some(p) = self.offsets().as_opt::<Primitive>() {
307            Ok(match_each_native_ptype!(p.ptype(), |P| {
308                p.as_slice::<P>()[index].as_()
309            }))
310        } else {
311            self.offsets()
312                .execute_scalar(index, &mut LEGACY_SESSION.create_execution_ctx())?
313                .as_primitive()
314                .as_::<usize>()
315                .ok_or_else(|| vortex_error::vortex_err!("offset value does not fit in usize"))
316        }
317    }
318
319    fn list_elements_at(&self, index: usize) -> VortexResult<ArrayRef> {
320        let start = self.offset_at(index)?;
321        let end = self.offset_at(index + 1)?;
322        self.elements().slice(start..end)
323    }
324
325    fn sliced_elements(&self) -> VortexResult<ArrayRef> {
326        let start = self.offset_at(0)?;
327        let end = self.offset_at(self.as_ref().len())?;
328        self.elements().slice(start..end)
329    }
330
331    fn element_dtype(&self) -> &DType {
332        self.elements().dtype()
333    }
334
335    fn reset_offsets(&self, recurse: bool, ctx: &mut ExecutionCtx) -> VortexResult<Array<List>> {
336        let mut elements = self.sliced_elements()?;
337        if recurse && elements.is_canonical() {
338            let compacted = elements
339                .clone()
340                .execute::<Canonical>(ctx)?
341                .compact(ctx)?
342                .into_array();
343            elements = compacted;
344        } else if recurse && let Some(child_list_array) = elements.as_opt::<List>() {
345            elements = child_list_array
346                .into_owned()
347                .reset_offsets(recurse, ctx)?
348                .into_array();
349        }
350
351        let offsets = self.offsets();
352        let first_offset = offsets.execute_scalar(0, ctx)?;
353        let adjusted_offsets = offsets.clone().binary(
354            ConstantArray::new(first_offset, offsets.len()).into_array(),
355            Operator::Sub,
356        )?;
357
358        Array::<List>::try_new(elements, adjusted_offsets, self.list_validity())
359    }
360}
361impl<T: TypedArrayRef<List>> ListArrayExt for T {}
362
363impl Array<List> {
364    /// Creates a new `ListArray`.
365    pub fn new(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
366        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
367        let len = offsets.len().saturating_sub(1);
368        let slots = ListData::make_slots(&elements, &offsets, &validity, len);
369        let data = ListData::build(elements, offsets, validity);
370        unsafe {
371            Array::from_parts_unchecked(ArrayParts::new(List, dtype, len, data).with_slots(slots))
372        }
373    }
374
375    /// Constructs a new `ListArray`.
376    pub fn try_new(
377        elements: ArrayRef,
378        offsets: ArrayRef,
379        validity: Validity,
380    ) -> VortexResult<Self> {
381        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
382        let len = offsets.len().saturating_sub(1);
383        let slots = ListData::make_slots(&elements, &offsets, &validity, len);
384        let data = ListData::try_build(elements, offsets, validity)?;
385        Ok(unsafe {
386            Array::from_parts_unchecked(ArrayParts::new(List, dtype, len, data).with_slots(slots))
387        })
388    }
389
390    /// Creates a new `ListArray` without validation.
391    ///
392    /// # Safety
393    ///
394    /// See [`ListData::new_unchecked`].
395    pub unsafe fn new_unchecked(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
396        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
397        let len = offsets.len().saturating_sub(1);
398        let slots = ListData::make_slots(&elements, &offsets, &validity, len);
399        let data = unsafe { ListData::new_unchecked() };
400        unsafe {
401            Array::from_parts_unchecked(ArrayParts::new(List, dtype, len, data).with_slots(slots))
402        }
403    }
404
405    pub fn into_data_parts(self) -> ListDataParts {
406        let dtype = self.dtype().clone();
407        let elements = self.slots()[ELEMENTS_SLOT]
408            .clone()
409            .vortex_expect("ListArray elements slot");
410        let offsets = self.slots()[OFFSETS_SLOT]
411            .clone()
412            .vortex_expect("ListArray offsets slot");
413        let validity = self.list_validity();
414        ListDataParts {
415            elements,
416            offsets,
417            validity,
418            dtype,
419        }
420    }
421}