vortex_array/arrays/list/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4mod compute;
5mod serde;
6
7use std::ops::Range;
8use std::sync::Arc;
9
10#[cfg(feature = "test-harness")]
11use itertools::Itertools;
12use num_traits::AsPrimitive;
13use vortex_dtype::{DType, match_each_integer_ptype, match_each_native_ptype};
14use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_ensure};
15use vortex_scalar::Scalar;
16
17#[cfg(feature = "test-harness")]
18use crate::OffsetPType;
19use crate::arrays::PrimitiveVTable;
20#[cfg(feature = "test-harness")]
21use crate::builders::{ArrayBuilder, ListBuilder};
22use crate::compute::{min_max, sub_scalar};
23use crate::stats::{ArrayStats, StatsSetRef};
24use crate::validity::Validity;
25use crate::vtable::{
26    ArrayVTable, CanonicalVTable, NotSupported, OperationsVTable, VTable, ValidityHelper,
27    ValidityVTableFromValidityHelper,
28};
29use crate::{Array, ArrayRef, Canonical, EncodingId, EncodingRef, IntoArray, vtable};
30
31vtable!(List);
32
33impl VTable for ListVTable {
34    type Array = ListArray;
35    type Encoding = ListEncoding;
36
37    type ArrayVTable = Self;
38    type CanonicalVTable = Self;
39    type OperationsVTable = Self;
40    type ValidityVTable = ValidityVTableFromValidityHelper;
41    type VisitorVTable = Self;
42    type ComputeVTable = NotSupported;
43    type EncodeVTable = NotSupported;
44    type PipelineVTable = NotSupported;
45    type SerdeVTable = Self;
46
47    fn id(_encoding: &Self::Encoding) -> EncodingId {
48        EncodingId::new_ref("vortex.list")
49    }
50
51    fn encoding(_array: &Self::Array) -> EncodingRef {
52        EncodingRef::new_ref(ListEncoding.as_ref())
53    }
54}
55
56/// A list array that stores variable-length lists of elements, similar to `Vec<Vec<T>>`.
57///
58/// This mirrors the Apache Arrow List array encoding and provides efficient storage
59/// for nested data where each row contains a list of elements of the same type.
60///
61/// ## Data Layout
62///
63/// The list array uses an offset-based encoding:
64/// - **Elements array**: A flat array containing all list elements concatenated together
65/// - **Offsets array**: Integer array where `offsets[i]` is an (inclusive) start index into
66///   the **elements** and `offsets[i+1]` is the (exclusive) stop index for the `i`th list.
67/// - **Validity**: Optional mask indicating which lists are null
68///
69/// This allows for excellent cascading compression of the elements and offsets, as similar values
70/// are clustered together and the offsets have a predictable pattern and small deltas between
71/// consecutive elements.
72///
73/// ## Offset Semantics
74///
75/// - Offsets must be non-nullable integers (i32, i64, etc.)
76/// - Offsets array has length `n+1` where `n` is the number of lists
77/// - List `i` contains elements from `elements[offsets[i]..offsets[i+1]]`
78/// - Offsets must be monotonically increasing
79///
80/// # Examples
81///
82/// ```
83/// use vortex_array::arrays::{ListArray, PrimitiveArray};
84/// use vortex_array::validity::Validity;
85/// use vortex_array::IntoArray;
86/// use vortex_buffer::buffer;
87/// use std::sync::Arc;
88///
89/// // Create a list array representing [[1, 2], [3, 4, 5], []]
90/// let elements = buffer![1i32, 2, 3, 4, 5].into_array();
91/// let offsets = buffer![0u32, 2, 5, 5].into_array(); // 3 lists
92///
93/// let list_array = ListArray::try_new(
94///     elements.into_array(),
95///     offsets.into_array(),
96///     Validity::NonNullable,
97/// ).unwrap();
98///
99/// assert_eq!(list_array.len(), 3);
100///
101/// // Access individual lists
102/// let first_list = list_array.list_elements_at(0);
103/// assert_eq!(first_list.len(), 2); // [1, 2]
104///
105/// let third_list = list_array.list_elements_at(2);
106/// assert!(third_list.is_empty()); // []
107/// ```
108#[derive(Clone, Debug)]
109pub struct ListArray {
110    dtype: DType,
111    elements: ArrayRef,
112    offsets: ArrayRef,
113    validity: Validity,
114    stats_set: ArrayStats,
115}
116
117#[derive(Clone, Debug)]
118pub struct ListEncoding;
119
120impl ListArray {
121    /// Creates a new [`ListArray`].
122    ///
123    /// # Panics
124    ///
125    /// Panics if the provided components do not satisfy the invariants documented
126    /// in [`ListArray::new_unchecked`].
127    pub fn new(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
128        Self::try_new(elements, offsets, validity).vortex_expect("ListArray new")
129    }
130
131    /// Constructs a new `ListArray`.
132    ///
133    /// See [`ListArray::new_unchecked`] for more information.
134    ///
135    /// # Errors
136    ///
137    /// Returns an error if the provided components do not satisfy the invariants documented in
138    /// [`ListArray::new_unchecked`].
139    pub fn try_new(
140        elements: ArrayRef,
141        offsets: ArrayRef,
142        validity: Validity,
143    ) -> VortexResult<Self> {
144        Self::validate(&elements, &offsets, &validity)?;
145
146        // SAFETY: validate ensures all invariants are met.
147        Ok(unsafe { Self::new_unchecked(elements, offsets, validity) })
148    }
149
150    /// Creates a new [`ListArray`] without validation from these components:
151    ///
152    /// * `elements` is a flat array containing all list elements concatenated.
153    /// * `offsets` is an integer array where `offsets[i]` is the start index for list `i`.
154    /// * `validity` holds the null values.
155    ///
156    /// # Safety
157    ///
158    /// The caller must ensure all of the following invariants are satisfied:
159    ///
160    /// - Offsets must be a non-nullable integer array.
161    /// - Offsets must have at least one element (even for empty lists, it should contain \[0\]).
162    /// - Offsets must be sorted (monotonically increasing).
163    /// - All offset values must be non-negative.
164    /// - The maximum offset must not exceed `elements.len()`.
165    /// - If validity is an array, its length must equal `offsets.len() - 1`.
166    pub unsafe fn new_unchecked(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self {
167        Self {
168            dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()),
169            elements,
170            offsets,
171            validity,
172            stats_set: Default::default(),
173        }
174    }
175
176    /// Validates the components that would be used to create a [`ListArray`].
177    ///
178    /// This function checks all the invariants required by [`ListArray::new_unchecked`].
179    pub(crate) fn validate(
180        elements: &dyn Array,
181        offsets: &dyn Array,
182        validity: &Validity,
183    ) -> VortexResult<()> {
184        // Offsets must have at least one element
185        vortex_ensure!(
186            !offsets.is_empty(),
187            "Offsets must have at least one element, [0] for an empty list"
188        );
189
190        // Offsets must be of integer type, and cannot go lower than 0.
191        vortex_ensure!(
192            offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
193            "offsets have invalid type {}",
194            offsets.dtype()
195        );
196
197        // We can safely unwrap the DType as primitive now
198        let offsets_ptype = offsets.dtype().as_ptype();
199
200        // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed)
201        if let Some(is_sorted) = offsets.statistics().compute_is_sorted() {
202            vortex_ensure!(is_sorted, "offsets must be sorted");
203        } else {
204            vortex_bail!("offsets must report is_sorted statistic");
205        }
206
207        // Validate that offsets min is non-negative, and max does not exceed the length of
208        // the elements array.
209        if let Some(min_max) = min_max(offsets)? {
210            match_each_integer_ptype!(offsets_ptype, |P| {
211                let max_offset = P::try_from(offsets.scalar_at(offsets.len() - 1))
212                    .vortex_expect("Offsets type must fit offsets values");
213
214                #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)]
215                {
216                    if let Some(min) = min_max.min.as_primitive().as_::<P>() {
217                        vortex_ensure!(
218                            min >= 0 && min <= max_offset,
219                            "offsets minimum {min} outside valid range [0, {max_offset}]"
220                        );
221                    }
222
223                    if let Some(max) = min_max.max.as_primitive().as_::<P>() {
224                        vortex_ensure!(
225                            max >= 0 && max <= max_offset,
226                            "offsets maximum {max} outside valid range [0, {max_offset}]"
227                        )
228                    }
229                }
230
231                vortex_ensure!(
232                    max_offset
233                        <= P::try_from(elements.len())
234                            .vortex_expect("Offsets type must be able to fit elements length"),
235                    "Max offset {max_offset} is beyond the length of the elements array {}",
236                    elements.len()
237                );
238            })
239        } else {
240            // TODO(aduffy): fallback to slower validation pathway?
241            vortex_bail!(
242                "offsets array with encoding {} must support min_max compute function",
243                offsets.encoding_id()
244            );
245        };
246
247        // If a validity array is present, it must be the same length as the ListArray
248        if let Some(validity_len) = validity.maybe_len() {
249            vortex_ensure!(
250                validity_len == offsets.len() - 1,
251                "validity with size {validity_len} does not match array size {}",
252                offsets.len() - 1
253            );
254        }
255
256        Ok(())
257    }
258
259    /// Returns the offset at the given index from the list array.
260    ///
261    /// Panics if the index is out of bounds.
262    pub fn offset_at(&self, index: usize) -> usize {
263        assert!(
264            index <= self.len(),
265            "Index {index} out of bounds 0..={}",
266            self.len()
267        );
268
269        self.offsets()
270            .as_opt::<PrimitiveVTable>()
271            .map(|p| match_each_native_ptype!(p.ptype(), |P| { p.as_slice::<P>()[index].as_() }))
272            .unwrap_or_else(|| {
273                self.offsets()
274                    .scalar_at(index)
275                    .as_primitive()
276                    .as_::<usize>()
277                    .vortex_expect("index must fit in usize")
278            })
279    }
280
281    /// Returns the elements of the list scalar at the given index of the list array.
282    pub fn list_elements_at(&self, index: usize) -> ArrayRef {
283        let start = self.offset_at(index);
284        let end = self.offset_at(index + 1);
285        self.elements().slice(start..end)
286    }
287
288    /// Returns elements of the list array referenced by the offsets array.
289    ///
290    /// This is useful for discarding any potentially unused parts of the underlying `elements`
291    /// child array.
292    pub fn sliced_elements(&self) -> ArrayRef {
293        let start = self.offset_at(0);
294        let end = self.offset_at(self.len());
295        self.elements().slice(start..end)
296    }
297
298    /// Returns the offsets array.
299    pub fn offsets(&self) -> &ArrayRef {
300        &self.offsets
301    }
302
303    /// Returns the elements array.
304    pub fn elements(&self) -> &ArrayRef {
305        &self.elements
306    }
307
308    /// Create a copy of this array by adjusting `offsets` to start at `0` and removing elements not
309    /// referenced by the `offsets`.
310    pub fn reset_offsets(&self) -> VortexResult<Self> {
311        let elements = self.sliced_elements();
312        let offsets = self.offsets();
313        let first_offset = offsets.scalar_at(0);
314        let adjusted_offsets = sub_scalar(offsets, first_offset)?;
315
316        Self::try_new(elements, adjusted_offsets, self.validity.clone())
317    }
318}
319
320impl ArrayVTable<ListVTable> for ListVTable {
321    fn len(array: &ListArray) -> usize {
322        array.offsets.len().saturating_sub(1)
323    }
324
325    fn dtype(array: &ListArray) -> &DType {
326        &array.dtype
327    }
328
329    fn stats(array: &ListArray) -> StatsSetRef<'_> {
330        array.stats_set.to_ref(array.as_ref())
331    }
332}
333
334impl OperationsVTable<ListVTable> for ListVTable {
335    fn slice(array: &ListArray, range: Range<usize>) -> ArrayRef {
336        ListArray::new(
337            array.elements().clone(),
338            array.offsets().slice(range.start..range.end + 1),
339            array.validity().slice(range),
340        )
341        .into_array()
342    }
343
344    fn scalar_at(array: &ListArray, index: usize) -> Scalar {
345        // By the preconditions we know that the list scalar is not null.
346        let elems = array.list_elements_at(index);
347        let scalars: Vec<Scalar> = (0..elems.len()).map(|i| elems.scalar_at(i)).collect();
348
349        Scalar::list(
350            Arc::new(elems.dtype().clone()),
351            scalars,
352            array.dtype().nullability(),
353        )
354    }
355}
356
357impl CanonicalVTable<ListVTable> for ListVTable {
358    fn canonicalize(array: &ListArray) -> Canonical {
359        Canonical::List(array.clone())
360    }
361}
362
363impl ValidityHelper for ListArray {
364    fn validity(&self) -> &Validity {
365        &self.validity
366    }
367}
368
369#[cfg(feature = "test-harness")]
370impl ListArray {
371    /// This is a convenience method to create a list array from an iterator of iterators.
372    /// This method is slow however since each element is first converted to a scalar and then
373    /// appended to the array.
374    pub fn from_iter_slow<O: OffsetPType, I: IntoIterator>(
375        iter: I,
376        dtype: Arc<DType>,
377    ) -> VortexResult<ArrayRef>
378    where
379        I::Item: IntoIterator,
380        <I::Item as IntoIterator>::Item: Into<Scalar>,
381    {
382        let iter = iter.into_iter();
383        let mut builder = ListBuilder::<O>::with_capacity(
384            dtype.clone(),
385            vortex_dtype::Nullability::NonNullable,
386            iter.size_hint().0,
387        );
388
389        for v in iter {
390            let elem = Scalar::list(
391                dtype.clone(),
392                v.into_iter().map(|x| x.into()).collect_vec(),
393                dtype.nullability(),
394            );
395            builder.append_value(elem.as_list())?
396        }
397        Ok(builder.finish())
398    }
399
400    pub fn from_iter_opt_slow<O: OffsetPType, I: IntoIterator<Item = Option<T>>, T>(
401        iter: I,
402        dtype: Arc<DType>,
403    ) -> VortexResult<ArrayRef>
404    where
405        T: IntoIterator,
406        T::Item: Into<Scalar>,
407    {
408        let iter = iter.into_iter();
409        let mut builder = ListBuilder::<O>::with_capacity(
410            dtype.clone(),
411            vortex_dtype::Nullability::Nullable,
412            iter.size_hint().0,
413        );
414
415        for v in iter {
416            if let Some(v) = v {
417                let elem = Scalar::list(
418                    dtype.clone(),
419                    v.into_iter().map(|x| x.into()).collect_vec(),
420                    dtype.nullability(),
421                );
422                builder.append_value(elem.as_list())?
423            } else {
424                builder.append_null()
425            }
426        }
427        Ok(builder.finish())
428    }
429}
430
431#[cfg(test)]
432mod tests;