Skip to main content

vortex_array/arrays/list/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Display;
5use std::fmt::Formatter;
6use std::sync::Arc;
7
8use num_traits::AsPrimitive;
9use smallvec::smallvec;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_bail;
13use vortex_error::vortex_ensure;
14use vortex_error::vortex_panic;
15
16use crate::ArrayRef;
17use crate::ArraySlots;
18use crate::IntoArray;
19use crate::LEGACY_SESSION;
20use crate::VortexSessionExecute;
21use crate::aggregate_fn::fns::min_max::min_max;
22use crate::array::Array;
23use crate::array::ArrayParts;
24use crate::array::TypedArrayRef;
25use crate::array::child_to_validity;
26use crate::array::validity_to_child;
27use crate::arrays::ConstantArray;
28use crate::arrays::List;
29use crate::arrays::Primitive;
30use crate::builtins::ArrayBuiltins;
31use crate::dtype::DType;
32use crate::dtype::NativePType;
33use crate::match_each_integer_ptype;
34use crate::match_each_native_ptype;
35use crate::scalar_fn::fns::operators::Operator;
36use crate::validity::Validity;
37
38/// The elements data array containing all list elements concatenated together.
39pub(super) const ELEMENTS_SLOT: usize = 0;
40/// The offsets array defining the start/end of each list within the elements array.
41pub(super) const OFFSETS_SLOT: usize = 1;
42/// The validity bitmap indicating which list elements are non-null.
43pub(super) const VALIDITY_SLOT: usize = 2;
44pub(super) const NUM_SLOTS: usize = 3;
45pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["elements", "offsets", "validity"];
46
47/// A list array that stores variable-length lists of elements, similar to `Vec<Vec<T>>`.
48///
49/// This mirrors the Apache Arrow List array encoding and provides efficient storage
50/// for nested data where each row contains a list of elements of the same type.
51///
52/// ## Data Layout
53///
54/// The list array uses an offset-based encoding:
55/// - **Elements array**: A flat array containing all list elements concatenated together
56/// - **Offsets array**: Integer array where `offsets[i]` is an (inclusive) start index into
57///   the **elements** and `offsets[i+1]` is the (exclusive) stop index for the `i`th list.
58/// - **Validity**: Optional mask indicating which lists are null
59///
60/// This allows for excellent cascading compression of the elements and offsets, as similar values
61/// are clustered together and the offsets have a predictable pattern and small deltas between
62/// consecutive elements.
63///
64/// ## Offset Semantics
65///
66/// - Offsets must be non-nullable integers (i32, i64, etc.)
67/// - Offsets array has length `n+1` where `n` is the number of lists
68/// - List `i` contains elements from `elements[offsets[i]..offsets[i+1]]`
69/// - Offsets must be monotonically increasing
70///
71/// # Examples
72///
73/// ```
74/// use vortex_array::arrays::{ListArray, PrimitiveArray};
75/// use vortex_array::arrays::list::ListArrayExt;
76/// use vortex_array::validity::Validity;
77/// use vortex_array::IntoArray;
78/// use vortex_buffer::buffer;
79/// use std::sync::Arc;
80///
81/// // Create a list array representing [[1, 2], [3, 4, 5], []]
82/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
83/// let offsets = buffer![0u32, 2, 5, 5].into_array(); // 3 lists
84///
85/// let list_array = ListArray::try_new(
86///     elements.into_array(),
87///     offsets.into_array(),
88///     Validity::NonNullable,
89/// ).unwrap();
90///
91/// assert_eq!(list_array.len(), 3);
92///
93/// // Access individual lists
94/// let first_list = list_array.list_elements_at(0).unwrap();
95/// assert_eq!(first_list.len(), 2); // [1, 2]
96///
97/// let third_list = list_array.list_elements_at(2).unwrap();
98/// assert!(third_list.is_empty()); // []
99/// ```
100#[derive(Clone, Debug, Default)]
101pub struct ListData;
102
103impl Display for ListData {
104    fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result {
105        Ok(())
106    }
107}
108
109pub struct ListDataParts {
110    pub elements: ArrayRef,
111    pub offsets: ArrayRef,
112    pub validity: Validity,
113    pub dtype: DType,
114}
115
116impl ListData {
117    pub(crate) fn make_slots(
118        elements: &ArrayRef,
119        offsets: &ArrayRef,
120        validity: &Validity,
121        len: usize,
122    ) -> ArraySlots {
123        smallvec![
124            Some(elements.clone()),
125            Some(offsets.clone()),
126            validity_to_child(validity, len),
127        ]
128    }
129
130    /// Creates a new `ListArray`.
131    ///
132    /// # Panics
133    ///
134    /// Panics if the provided components do not satisfy the invariants documented
135    /// in `ListArray::new_unchecked`.
136    pub fn build(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
137        Self::try_build(elements, offsets, validity).vortex_expect("ListArray new")
138    }
139
140    /// Constructs a new `ListArray`.
141    ///
142    /// See `ListArray::new_unchecked` for more information.
143    ///
144    /// # Errors
145    ///
146    /// Returns an error if the provided components do not satisfy the invariants documented in
147    /// `ListArray::new_unchecked`.
148    pub(crate) fn try_build(
149        elements: ArrayRef,
150        offsets: ArrayRef,
151        validity: Validity,
152    ) -> VortexResult<Self> {
153        Self::validate(&elements, &offsets, &validity)?;
154
155        // SAFETY: validate ensures all invariants are met.
156        Ok(unsafe { Self::new_unchecked() })
157    }
158
159    /// Creates a new `ListArray` without validation from these components:
160    ///
161    /// * `elements` is a flat array containing all list elements concatenated.
162    /// * `offsets` is an integer array where `offsets[i]` is the start index for list `i`.
163    /// * `validity` holds the null values.
164    ///
165    /// # Safety
166    ///
167    /// The caller must ensure all of the following invariants are satisfied:
168    ///
169    /// - Offsets must be a non-nullable integer array.
170    /// - Offsets must have at least one element (even for empty lists, it should contain \[0\]).
171    /// - Offsets must be sorted (monotonically increasing).
172    /// - All offset values must be non-negative.
173    /// - The maximum offset must not exceed `elements.len()`.
174    /// - If validity is an array, its length must equal `offsets.len() - 1`.
175    pub unsafe fn new_unchecked() -> Self {
176        Self
177    }
178
179    /// Validates the components that would be used to create a `ListArray`.
180    ///
181    /// This function checks all the invariants required by `ListArray::new_unchecked`.
182    pub fn validate(
183        elements: &ArrayRef,
184        offsets: &ArrayRef,
185        validity: &Validity,
186    ) -> VortexResult<()> {
187        // Offsets must have at least one element
188        vortex_ensure!(
189            !offsets.is_empty(),
190            InvalidArgument: "Offsets must have at least one element, [0] for an empty list"
191        );
192
193        // Offsets must be of integer type, and cannot go lower than 0.
194        vortex_ensure!(
195            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
196            InvalidArgument: "offsets have invalid type {}",
197            offsets.dtype()
198        );
199
200        // We can safely unwrap the DType as primitive now
201        let offsets_ptype = offsets.dtype().as_ptype();
202        let mut ctx = LEGACY_SESSION.create_execution_ctx();
203
204        // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed)
205        if let Some(is_sorted) = offsets.statistics().compute_is_sorted(&mut ctx) {
206            vortex_ensure!(is_sorted, InvalidArgument: "offsets must be sorted");
207        } else {
208            vortex_bail!(InvalidArgument: "offsets must report is_sorted statistic");
209        }
210
211        // Validate that offsets min is non-negative, and max does not exceed the length of
212        // the elements array.
213        if let Some(min_max) = min_max(offsets, &mut ctx)? {
214            match_each_integer_ptype!(offsets_ptype, |P| {
215                #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
216                {
217                    let max = min_max
218                        .max
219                        .as_primitive()
220                        .as_::<P>()
221                        .vortex_expect("offsets type must fit offsets values");
222                    let min = min_max
223                        .min
224                        .as_primitive()
225                        .as_::<P>()
226                        .vortex_expect("offsets type must fit offsets values");
227
228                    vortex_ensure!(
229                        min >= 0,
230                        InvalidArgument: "offsets minimum {min} outside valid range [0, {max}]"
231                    );
232
233                    vortex_ensure!(
234                        max <= P::try_from(elements.len()).unwrap_or_else(|_| vortex_panic!(
235                            "Offsets type {} must be able to fit elements length {}",
236                            <P as NativePType>::PTYPE,
237                            elements.len()
238                        )),
239                        InvalidArgument: "Max offset {max} is beyond the length of the elements array {}",
240                        elements.len()
241                    );
242                }
243            })
244        } else {
245            // TODO(aduffy): fallback to slower validation pathway?
246            vortex_bail!(
247                InvalidArgument: "offsets array with encoding {} must support min_max compute function",
248                offsets.encoding_id()
249            );
250        };
251
252        // If a validity array is present, it must be the same length as the ListArray
253        if let Some(validity_len) = validity.maybe_len() {
254            vortex_ensure!(
255                validity_len == offsets.len() - 1,
256                InvalidArgument: "validity with size {validity_len} does not match array size {}",
257                offsets.len() - 1
258            );
259        }
260
261        Ok(())
262    }
263    // TODO(connor)[ListView]: Create 2 functions `reset_offsets` and `recursive_reset_offsets`,
264    // where `reset_offsets` is infallible.
265    // Also, `reset_offsets` can be made more efficient by replacing `sub_scalar` with a match on
266    // the offset type and manual subtraction and fast path where `offsets[0] == 0`.
267}
268
269pub trait ListArrayExt: TypedArrayRef<List> {
270    fn nullability(&self) -> crate::dtype::Nullability {
271        match self.as_ref().dtype() {
272            DType::List(_, nullability) => *nullability,
273            _ => unreachable!("ListArrayExt requires a list dtype"),
274        }
275    }
276
277    fn elements(&self) -> &ArrayRef {
278        self.as_ref().slots()[ELEMENTS_SLOT]
279            .as_ref()
280            .vortex_expect("ListArray elements slot")
281    }
282
283    fn offsets(&self) -> &ArrayRef {
284        self.as_ref().slots()[OFFSETS_SLOT]
285            .as_ref()
286            .vortex_expect("ListArray offsets slot")
287    }
288
289    fn list_validity(&self) -> Validity {
290        child_to_validity(
291            self.as_ref().slots()[VALIDITY_SLOT].as_ref(),
292            self.nullability(),
293        )
294    }
295
296    fn offset_at(&self, index: usize) -> VortexResult<usize> {
297        vortex_ensure!(
298            index <= self.as_ref().len(),
299            "Index {index} out of bounds 0..={}",
300            self.as_ref().len()
301        );
302
303        if let Some(p) = self.offsets().as_opt::<Primitive>() {
304            Ok(match_each_native_ptype!(p.ptype(), |P| {
305                p.as_slice::<P>()[index].as_()
306            }))
307        } else {
308            self.offsets()
309                .execute_scalar(index, &mut LEGACY_SESSION.create_execution_ctx())?
310                .as_primitive()
311                .as_::<usize>()
312                .ok_or_else(|| vortex_error::vortex_err!("offset value does not fit in usize"))
313        }
314    }
315
316    fn list_elements_at(&self, index: usize) -> VortexResult<ArrayRef> {
317        let start = self.offset_at(index)?;
318        let end = self.offset_at(index + 1)?;
319        self.elements().slice(start..end)
320    }
321
322    fn sliced_elements(&self) -> VortexResult<ArrayRef> {
323        let start = self.offset_at(0)?;
324        let end = self.offset_at(self.as_ref().len())?;
325        self.elements().slice(start..end)
326    }
327
328    fn element_dtype(&self) -> &DType {
329        self.elements().dtype()
330    }
331
332    fn reset_offsets(&self, recurse: bool) -> VortexResult<Array<List>> {
333        let mut elements = self.sliced_elements()?;
334        if recurse && elements.is_canonical() {
335            #[expect(deprecated)]
336            let compacted = elements.to_canonical()?.compact()?.into_array();
337            elements = compacted;
338        } else if recurse && let Some(child_list_array) = elements.as_opt::<List>() {
339            elements = child_list_array
340                .into_owned()
341                .reset_offsets(recurse)?
342                .into_array();
343        }
344
345        let offsets = self.offsets();
346        let first_offset = offsets.execute_scalar(0, &mut LEGACY_SESSION.create_execution_ctx())?;
347        let adjusted_offsets = offsets.clone().binary(
348            ConstantArray::new(first_offset, offsets.len()).into_array(),
349            Operator::Sub,
350        )?;
351
352        Array::<List>::try_new(elements, adjusted_offsets, self.list_validity())
353    }
354}
355impl<T: TypedArrayRef<List>> ListArrayExt for T {}
356
357impl Array<List> {
358    /// Creates a new `ListArray`.
359    pub fn new(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
360        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
361        let len = offsets.len().saturating_sub(1);
362        let slots = ListData::make_slots(&elements, &offsets, &validity, len);
363        let data = ListData::build(elements, offsets, validity);
364        unsafe {
365            Array::from_parts_unchecked(ArrayParts::new(List, dtype, len, data).with_slots(slots))
366        }
367    }
368
369    /// Constructs a new `ListArray`.
370    pub fn try_new(
371        elements: ArrayRef,
372        offsets: ArrayRef,
373        validity: Validity,
374    ) -> VortexResult<Self> {
375        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
376        let len = offsets.len().saturating_sub(1);
377        let slots = ListData::make_slots(&elements, &offsets, &validity, len);
378        let data = ListData::try_build(elements, offsets, validity)?;
379        Ok(unsafe {
380            Array::from_parts_unchecked(ArrayParts::new(List, dtype, len, data).with_slots(slots))
381        })
382    }
383
384    /// Creates a new `ListArray` without validation.
385    ///
386    /// # Safety
387    ///
388    /// See [`ListData::new_unchecked`].
389    pub unsafe fn new_unchecked(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
390        let dtype = DType::List(Arc::new(elements.dtype().clone()), validity.nullability());
391        let len = offsets.len().saturating_sub(1);
392        let slots = ListData::make_slots(&elements, &offsets, &validity, len);
393        let data = unsafe { ListData::new_unchecked() };
394        unsafe {
395            Array::from_parts_unchecked(ArrayParts::new(List, dtype, len, data).with_slots(slots))
396        }
397    }
398
399    pub fn into_data_parts(self) -> ListDataParts {
400        let dtype = self.dtype().clone();
401        let elements = self.slots()[ELEMENTS_SLOT]
402            .clone()
403            .vortex_expect("ListArray elements slot");
404        let offsets = self.slots()[OFFSETS_SLOT]
405            .clone()
406            .vortex_expect("ListArray offsets slot");
407        let validity = self.list_validity();
408        ListDataParts {
409            elements,
410            offsets,
411            validity,
412            dtype,
413        }
414    }
415}