polars_arrow/array/list/
mod.rs

1use super::specification::try_check_offsets_bounds;
2use super::{Array, Splitable, new_empty_array};
3use crate::bitmap::Bitmap;
4use crate::datatypes::{ArrowDataType, Field};
5use crate::offset::{Offset, Offsets, OffsetsBuffer};
6
7mod builder;
8pub use builder::*;
9mod ffi;
10pub(super) mod fmt;
11mod iterator;
12pub use iterator::*;
13mod mutable;
14pub use mutable::*;
15use polars_error::{PolarsResult, polars_bail};
16use polars_utils::pl_str::PlSmallStr;
17#[cfg(feature = "proptest")]
18pub mod proptest;
19
/// Name used for the values array within List/FixedSizeList arrays.
///
/// [`ListArray::default_datatype`] gives its inner field this name.
pub const LIST_VALUES_NAME: PlSmallStr = PlSmallStr::from_static("item");
22
/// An [`Array`] semantically equivalent to `Vec<Option<Vec<Option<T>>>>` with Arrow's in-memory
/// representation.
///
/// The offset type `O` selects between the `List` and `LargeList` data types
/// (see [`ListArray::try_get_child`]).
#[derive(Clone)]
pub struct ListArray<O: Offset> {
    /// Data type of the array; its inner field's data type must equal `values.dtype()`
    /// (checked by [`ListArray::try_new`]).
    dtype: ArrowDataType,
    /// Offsets into `values`; element `i` is the slice of `values` between offsets `i` and `i + 1`.
    offsets: OffsetsBuffer<O>,
    /// The child array holding the values of all lists.
    values: Box<dyn Array>,
    /// Optional validity bitmap; when present its length equals `offsets.len_proxy()`
    /// (checked by [`ListArray::try_new`]).
    validity: Option<Bitmap>,
}
31
32impl<O: Offset> ListArray<O> {
33    /// Creates a new [`ListArray`].
34    ///
35    /// # Errors
36    /// This function returns an error iff:
37    /// * `offsets.last()` is greater than `values.len()`.
38    /// * the validity's length is not equal to `offsets.len_proxy()`.
39    /// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`].
40    /// * The `dtype`'s inner field's data type is not equal to `values.dtype`.
41    /// # Implementation
42    /// This function is `O(1)`
43    pub fn try_new(
44        dtype: ArrowDataType,
45        offsets: OffsetsBuffer<O>,
46        values: Box<dyn Array>,
47        validity: Option<Bitmap>,
48    ) -> PolarsResult<Self> {
49        try_check_offsets_bounds(&offsets, values.len())?;
50
51        if validity
52            .as_ref()
53            .is_some_and(|validity| validity.len() != offsets.len_proxy())
54        {
55            polars_bail!(ComputeError: "validity mask length must match the number of values")
56        }
57
58        let child_dtype = Self::try_get_child(&dtype)?.dtype();
59        let values_dtype = values.dtype();
60        if child_dtype != values_dtype {
61            polars_bail!(ComputeError: "ListArray's child's DataType must match. However, the expected DataType is {child_dtype:?} while it got {values_dtype:?}.");
62        }
63
64        Ok(Self {
65            dtype,
66            offsets,
67            values,
68            validity,
69        })
70    }
71
72    /// Creates a new [`ListArray`].
73    ///
74    /// # Panics
75    /// This function panics iff:
76    /// * `offsets.last()` is greater than `values.len()`.
77    /// * the validity's length is not equal to `offsets.len_proxy()`.
78    /// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`].
79    /// * The `dtype`'s inner field's data type is not equal to `values.dtype`.
80    /// # Implementation
81    /// This function is `O(1)`
82    pub fn new(
83        dtype: ArrowDataType,
84        offsets: OffsetsBuffer<O>,
85        values: Box<dyn Array>,
86        validity: Option<Bitmap>,
87    ) -> Self {
88        Self::try_new(dtype, offsets, values, validity).unwrap()
89    }
90
91    /// Returns a new empty [`ListArray`].
92    pub fn new_empty(dtype: ArrowDataType) -> Self {
93        let values = new_empty_array(Self::get_child_type(&dtype).clone());
94        Self::new(dtype, OffsetsBuffer::default(), values, None)
95    }
96
97    /// Returns a new null [`ListArray`].
98    #[inline]
99    pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
100        let child = Self::get_child_type(&dtype).clone();
101        Self::new(
102            dtype,
103            Offsets::new_zeroed(length).into(),
104            new_empty_array(child),
105            Some(Bitmap::new_zeroed(length)),
106        )
107    }
108
109    pub fn into_inner(
110        self,
111    ) -> (
112        ArrowDataType,
113        Box<dyn Array>,
114        OffsetsBuffer<O>,
115        Option<Bitmap>,
116    ) {
117        (self.dtype, self.values, self.offsets, self.validity)
118    }
119}
120
121impl<O: Offset> ListArray<O> {
122    /// Slices this [`ListArray`].
123    /// # Panics
124    /// panics iff `offset + length > self.len()`
125    pub fn slice(&mut self, offset: usize, length: usize) {
126        assert!(
127            offset + length <= self.len(),
128            "the offset of the new Buffer cannot exceed the existing length"
129        );
130        unsafe { self.slice_unchecked(offset, length) }
131    }
132
133    /// Slices this [`ListArray`].
134    ///
135    /// # Safety
136    /// The caller must ensure that `offset + length < self.len()`.
137    pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
138        self.validity = self
139            .validity
140            .take()
141            .map(|bitmap| bitmap.sliced_unchecked(offset, length))
142            .filter(|bitmap| bitmap.unset_bits() > 0);
143        self.offsets.slice_unchecked(offset, length + 1);
144    }
145
146    impl_sliced!();
147    impl_mut_validity!();
148    impl_into_array!();
149}
150
151// Accessors
152impl<O: Offset> ListArray<O> {
153    /// Returns the length of this array
154    #[inline]
155    pub fn len(&self) -> usize {
156        self.offsets.len_proxy()
157    }
158
159    /// Returns the element at index `i`
160    /// # Panic
161    /// Panics iff `i >= self.len()`
162    #[inline]
163    pub fn value(&self, i: usize) -> Box<dyn Array> {
164        assert!(i < self.len());
165        // SAFETY: invariant of this function
166        unsafe { self.value_unchecked(i) }
167    }
168
169    /// Returns the element at index `i` as &str
170    ///
171    /// # Safety
172    /// Assumes that the `i < self.len`.
173    #[inline]
174    pub unsafe fn value_unchecked(&self, i: usize) -> Box<dyn Array> {
175        // SAFETY: the invariant of the function
176        let (start, end) = self.offsets.start_end_unchecked(i);
177        let length = end - start;
178
179        // SAFETY: the invariant of the struct
180        self.values.sliced_unchecked(start, length)
181    }
182
183    /// The optional validity.
184    #[inline]
185    pub fn validity(&self) -> Option<&Bitmap> {
186        self.validity.as_ref()
187    }
188
189    /// The offsets [`Buffer`].
190    #[inline]
191    pub fn offsets(&self) -> &OffsetsBuffer<O> {
192        &self.offsets
193    }
194
195    /// The values.
196    #[inline]
197    pub fn values(&self) -> &Box<dyn Array> {
198        &self.values
199    }
200}
201
202impl<O: Offset> ListArray<O> {
203    /// Returns a default [`ArrowDataType`]: inner field is named "item" and is nullable
204    pub fn default_datatype(dtype: ArrowDataType) -> ArrowDataType {
205        let field = Box::new(Field::new(LIST_VALUES_NAME, dtype, true));
206        if O::IS_LARGE {
207            ArrowDataType::LargeList(field)
208        } else {
209            ArrowDataType::List(field)
210        }
211    }
212
213    /// Returns a the inner [`Field`]
214    /// # Panics
215    /// Panics iff the logical type is not consistent with this struct.
216    pub fn get_child_field(dtype: &ArrowDataType) -> &Field {
217        Self::try_get_child(dtype).unwrap()
218    }
219
220    /// Returns a the inner [`Field`]
221    /// # Errors
222    /// Panics iff the logical type is not consistent with this struct.
223    pub fn try_get_child(dtype: &ArrowDataType) -> PolarsResult<&Field> {
224        if O::IS_LARGE {
225            match dtype.to_logical_type() {
226                ArrowDataType::LargeList(child) => Ok(child.as_ref()),
227                _ => polars_bail!(ComputeError: "ListArray<i64> expects DataType::LargeList"),
228            }
229        } else {
230            match dtype.to_logical_type() {
231                ArrowDataType::List(child) => Ok(child.as_ref()),
232                _ => polars_bail!(ComputeError: "ListArray<i32> expects DataType::List"),
233            }
234        }
235    }
236
237    /// Returns a the inner [`ArrowDataType`]
238    /// # Panics
239    /// Panics iff the logical type is not consistent with this struct.
240    pub fn get_child_type(dtype: &ArrowDataType) -> &ArrowDataType {
241        Self::get_child_field(dtype).dtype()
242    }
243}
244
// `Array` trait implementation for `ListArray`.
impl<O: Offset> Array for ListArray<O> {
    // NOTE(review): this macro is defined outside this file; presumably it supplies the
    // remaining `Array` trait methods — confirm against its definition.
    impl_common_array!();

    /// Returns the optional validity bitmap of this array.
    fn validity(&self) -> Option<&Bitmap> {
        self.validity.as_ref()
    }

    /// Returns a boxed clone of this array whose validity is replaced by `validity`.
    // NOTE(review): the by-value `with_validity` called on the clone is not defined in this
    // file; presumably it is the inherent method generated by `impl_mut_validity!()` — confirm.
    #[inline]
    fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
        Box::new(self.clone().with_validity(validity))
    }
}
257
258impl<O: Offset> Splitable for ListArray<O> {
259    fn check_bound(&self, offset: usize) -> bool {
260        offset <= self.len()
261    }
262
263    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
264        let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
265        let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
266
267        (
268            Self {
269                dtype: self.dtype.clone(),
270                offsets: lhs_offsets,
271                validity: lhs_validity,
272                values: self.values.clone(),
273            },
274            Self {
275                dtype: self.dtype.clone(),
276                offsets: rhs_offsets,
277                validity: rhs_validity,
278                values: self.values.clone(),
279            },
280        )
281    }
282}