Skip to main content

polars_core/chunked_array/list/
mod.rs

1//! Special list utility methods
2pub(super) mod iterator;
3
4use std::borrow::Cow;
5
6use polars_utils::itertools::Itertools;
7
8use crate::prelude::*;
9
10impl ListChunked {
11    /// Get the inner data type of the list.
12    pub fn inner_dtype(&self) -> &DataType {
13        match self.dtype() {
14            DataType::List(dt) => dt.as_ref(),
15            _ => unreachable!(),
16        }
17    }
18
19    /// # Panics
20    /// Panics if the physical representation of `dtype` differs the physical
21    /// representation of the existing inner `dtype`.
22    pub fn set_inner_dtype(&mut self, dtype: DataType) {
23        assert_eq!(dtype.to_physical(), self.inner_dtype().to_physical());
24        let field = Arc::make_mut(&mut self.field);
25        field.coerce(DataType::List(Box::new(dtype)));
26    }
27
28    pub fn set_fast_explode(&mut self) {
29        self.set_fast_explode_list(true)
30    }
31
32    pub fn _can_fast_explode(&self) -> bool {
33        self.get_fast_explode_list()
34    }
35
36    /// Set the logical type of the [`ListChunked`].
37    ///
38    /// # Safety
39    /// The caller must ensure that the logical type given fits the physical type of the array.
40    pub unsafe fn to_logical(&mut self, inner_dtype: DataType) {
41        debug_assert_eq!(&inner_dtype.to_physical(), self.inner_dtype());
42        let fld = Arc::make_mut(&mut self.field);
43        fld.coerce(DataType::List(Box::new(inner_dtype)))
44    }
45
46    /// Convert the datatype of the list into the physical datatype.
47    pub fn to_physical_repr(&self) -> Cow<'_, ListChunked> {
48        let Cow::Owned(physical_repr) = self.get_inner().to_physical_repr() else {
49            return Cow::Borrowed(self);
50        };
51
52        let ca = if physical_repr.chunks().len() == 1 && self.chunks().len() > 1 {
53            // Physical repr got rechunked, rechunk self as well.
54            self.rechunk()
55        } else {
56            Cow::Borrowed(self)
57        };
58
59        assert_eq!(ca.chunks().len(), physical_repr.chunks().len());
60
61        let chunks: Vec<_> = ca
62            .downcast_iter()
63            .zip(physical_repr.into_chunks())
64            .map(|(chunk, values)| {
65                LargeListArray::new(
66                    ArrowDataType::LargeList(Box::new(ArrowField::new(
67                        LIST_VALUES_NAME,
68                        values.dtype().clone(),
69                        true,
70                    ))),
71                    chunk.offsets().clone(),
72                    values,
73                    chunk.validity().cloned(),
74                )
75                .to_boxed()
76            })
77            .collect();
78
79        let name = self.name().clone();
80        let dtype = DataType::List(Box::new(self.inner_dtype().to_physical()));
81        Cow::Owned(unsafe { ListChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) })
82    }
83
84    /// Convert a non-logical [`ListChunked`] back into a logical [`ListChunked`] without casting.
85    ///
86    /// # Safety
87    ///
88    /// This can lead to invalid memory access in downstream code.
89    pub unsafe fn from_physical_unchecked(
90        &self,
91        to_inner_dtype: DataType,
92    ) -> PolarsResult<ListChunked> {
93        debug_assert!(!self.inner_dtype().is_logical());
94
95        let inner_chunks = self
96            .downcast_iter()
97            .map(|chunk| chunk.values())
98            .cloned()
99            .collect();
100
101        let inner = unsafe {
102            Series::from_chunks_and_dtype_unchecked(
103                PlSmallStr::EMPTY,
104                inner_chunks,
105                self.inner_dtype(),
106            )
107        };
108        let inner = unsafe { inner.from_physical_unchecked(&to_inner_dtype) }?;
109
110        let chunks: Vec<_> = self
111            .downcast_iter()
112            .zip(inner.into_chunks())
113            .map(|(chunk, values)| {
114                LargeListArray::new(
115                    ArrowDataType::LargeList(Box::new(ArrowField::new(
116                        LIST_VALUES_NAME,
117                        values.dtype().clone(),
118                        true,
119                    ))),
120                    chunk.offsets().clone(),
121                    values,
122                    chunk.validity().cloned(),
123                )
124                .to_boxed()
125            })
126            .collect();
127
128        let name = self.name().clone();
129        let dtype = DataType::List(Box::new(to_inner_dtype));
130        Ok(unsafe { ListChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) })
131    }
132
133    /// Get the inner values as [`Series`], ignoring the list offsets.
134    pub fn get_inner(&self) -> Series {
135        let chunks: Vec<_> = self.downcast_iter().map(|c| c.values().clone()).collect();
136
137        // SAFETY: Data type of arrays matches because they are chunks from the same array.
138        unsafe {
139            Series::from_chunks_and_dtype_unchecked(self.name().clone(), chunks, self.inner_dtype())
140        }
141    }
142
143    pub fn inner_length(&self) -> usize {
144        self.downcast_iter().map(|c| c.values().len()).sum()
145    }
146
147    /// Ignore the list indices and apply `func` to the inner type as [`Series`].
148    pub fn apply_to_inner(
149        &self,
150        func: &dyn Fn(Series) -> PolarsResult<Series>,
151    ) -> PolarsResult<ListChunked> {
152        // generated Series will have wrong length otherwise.
153        let ca = self.rechunk();
154        let arr = ca.downcast_as_array();
155
156        // SAFETY:
157        // Inner dtype is passed correctly
158        let elements = unsafe {
159            Series::from_chunks_and_dtype_unchecked(
160                self.name().clone(),
161                vec![arr.values().clone()],
162                ca.inner_dtype(),
163            )
164        };
165
166        let expected_len = elements.len();
167        let out: Series = func(elements)?;
168        polars_ensure!(
169            out.len() == expected_len,
170            ComputeError: "the function should apply element-wise, it removed elements instead"
171        );
172        let out = out.rechunk();
173        let values = out.chunks()[0].clone();
174
175        let inner_dtype = LargeListArray::default_datatype(values.dtype().clone());
176        let arr = LargeListArray::new(
177            inner_dtype,
178            (*arr.offsets()).clone(),
179            values,
180            arr.validity().cloned(),
181        );
182
183        // SAFETY: arr's inner dtype is derived from out dtype.
184        Ok(unsafe {
185            ListChunked::from_chunks_and_dtype_unchecked(
186                ca.name().clone(),
187                vec![Box::new(arr)],
188                DataType::List(Box::new(out.dtype().clone())),
189            )
190        })
191    }
192
193    pub fn with_inner_values(&self, values: &Series) -> ListChunked {
194        if cfg!(debug_assertions) {
195            assert_eq!(values.len(), self.inner_length());
196        }
197
198        // Align the chunks of the lists inner values and the values series.
199        fn align_inner_chunks(ca: &'_ ListChunked, values: &'_ Series) -> Series {
200            if ca.chunks().len() == values.chunks().len()
201                && ca
202                    .downcast_iter()
203                    .map(|arr| arr.values().len())
204                    .zip(values.chunks().iter().map(|arr| arr.len()))
205                    .all_equal()
206            {
207                return values.clone();
208            }
209
210            let mut values = values.rechunk();
211            let chunks = unsafe { values.chunks_mut() };
212            let mut arr = chunks.pop().unwrap();
213            chunks.extend(ca.downcast_iter().map(|ca_arr| {
214                let chunk;
215                (chunk, arr) = arr.split_at_boxed(ca_arr.values().len());
216                chunk
217            }));
218            assert!(arr.is_empty());
219            values
220        }
221
222        let values = align_inner_chunks(self, values);
223        let values_dtype = values.dtype().clone();
224
225        let chunks = self
226            .downcast_iter()
227            .zip(values.into_chunks())
228            .map(|(ca_arr, v_arr)| {
229                debug_assert_eq!(ca_arr.values().len(), v_arr.len());
230                LargeListArray::new(
231                    LargeListArray::default_datatype(v_arr.dtype().clone()),
232                    (ca_arr.offsets()).clone(),
233                    v_arr,
234                    ca_arr.validity().cloned(),
235                )
236                .to_boxed()
237            })
238            .collect::<Vec<_>>();
239
240        // SAFETY: arr's inner dtype is derived from out dtype.
241        unsafe {
242            ListChunked::from_chunks_and_dtype_unchecked(
243                self.name().clone(),
244                chunks,
245                DataType::List(Box::new(values_dtype)),
246            )
247        }
248    }
249}