polars_core/chunked_array/ops/chunkops.rs

use std::cell::Cell;

use arrow::bitmap::{Bitmap, BitmapBuilder};
use arrow::legacy::kernels::concatenate::concatenate_owned_unchecked;
use polars_error::constants::LENGTH_LIMIT_MSG;

use super::*;
use crate::chunked_array::flags::StatisticsFlags;
#[cfg(feature = "object")]
use crate::chunked_array::object::builder::ObjectChunkedBuilder;
use crate::utils::slice_offsets;

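/// Split `chunks` at `offset` (counted from the end when negative), clamped
/// to `own_length`. Both halves are zero-copy slices of the input chunks.
///
/// For example, chunks of lengths `[3, 4]` split at offset 5 yield left
/// chunks of lengths `[3, 2]` and a single right chunk of length 2.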
pub(crate) fn split_at(
    chunks: &[ArrayRef],
    offset: i64,
    own_length: usize,
) -> (Vec<ArrayRef>, Vec<ArrayRef>) {
    let mut new_chunks_left = Vec::with_capacity(1);
    let mut new_chunks_right = Vec::with_capacity(1);
    let (raw_offset, _) = slice_offsets(offset, 0, own_length);

    let mut remaining_offset = raw_offset;
    let mut iter = chunks.iter();

    for chunk in &mut iter {
        let chunk_len = chunk.len();
        if remaining_offset > 0 && remaining_offset >= chunk_len {
            remaining_offset -= chunk_len;
            new_chunks_left.push(chunk.clone());
            continue;
        }

        let (l, r) = chunk.split_at_boxed(remaining_offset);
        new_chunks_left.push(l);
        new_chunks_right.push(r);
        break;
    }

    for chunk in iter {
        new_chunks_right.push(chunk.clone())
    }
    if new_chunks_left.is_empty() {
        new_chunks_left.push(chunks[0].sliced(0, 0));
    }
    if new_chunks_right.is_empty() {
        new_chunks_right.push(chunks[0].sliced(0, 0));
    }
    (new_chunks_left, new_chunks_right)
}

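/// Slice `chunks`, starting at `offset` (counted from the end when negative)
/// and taking at most `slice_length` elements, clamped to `own_length`.
/// Returns the new chunks together with the resulting total length.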
pub(crate) fn slice(
    chunks: &[ArrayRef],
    offset: i64,
    slice_length: usize,
    own_length: usize,
) -> (Vec<ArrayRef>, usize) {
    let mut new_chunks = Vec::with_capacity(1);
    let (raw_offset, slice_len) = slice_offsets(offset, slice_length, own_length);

    let mut remaining_length = slice_len;
    let mut remaining_offset = raw_offset;
    let mut new_len = 0;

    for chunk in chunks {
        let chunk_len = chunk.len();
        if remaining_offset > 0 && remaining_offset >= chunk_len {
            remaining_offset -= chunk_len;
            continue;
        }
        let take_len = if remaining_length + remaining_offset > chunk_len {
            chunk_len - remaining_offset
        } else {
            remaining_length
        };
        new_len += take_len;

        debug_assert!(remaining_offset + take_len <= chunk.len());
        unsafe {
            // SAFETY:
            // this function ensures the slices are in bounds
            new_chunks.push(chunk.sliced_unchecked(remaining_offset, take_len));
        }
        remaining_length -= take_len;
        remaining_offset = 0;
        if remaining_length == 0 {
            break;
        }
    }
    if new_chunks.is_empty() {
        new_chunks.push(chunks[0].sliced(0, 0));
    }
    (new_chunks, new_len)
}

// When we deal with arrays and lists we can easily exceed the length limit if
// we take the underlying values array as a Series. This call stack is hard to
// follow, so for this one case we make an exception and use a thread local.
thread_local!(static CHECK_LENGTH: Cell<bool> = const { Cell::new(true) });

/// Meant for internal use. In very rare conditions this can be turned off.
/// # Safety
/// The caller must ensure that a Series exceeding the length limit gets
/// deconstructed into array values or list values beforehand and is never
/// used directly.
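///
/// A minimal usage sketch (the surrounding deconstruction code is assumed):
/// ```ignore
/// // Temporarily disable the length check; the caller upholds the safety
/// // contract described above.
/// unsafe { _set_check_length(false) };
/// // ... deconstruct the oversized Series into list/array values ...
/// unsafe { _set_check_length(true) };
/// ```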
pub unsafe fn _set_check_length(check: bool) {
    CHECK_LENGTH.set(check)
}

impl<T: PolarsDataType> ChunkedArray<T> {
    /// Get the length of the [`ChunkedArray`].
    #[inline]
    pub fn len(&self) -> usize {
        self.length
    }

    /// Return the number of null values in the [`ChunkedArray`].
    #[inline]
    pub fn null_count(&self) -> usize {
        self.null_count
    }

    /// Set the null count directly.
    ///
    /// This can be useful after mutably adjusting the validity of the
    /// underlying arrays.
    ///
    /// # Safety
    /// The new null count must match the total null count of the underlying
    /// arrays.
    pub unsafe fn set_null_count(&mut self, null_count: usize) {
        self.null_count = null_count;
    }

    /// Check if the [`ChunkedArray`] is empty.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Compute and cache the length and null count from the chunks.
    pub(crate) fn compute_len(&mut self) {
        fn inner(chunks: &[ArrayRef]) -> usize {
            match chunks.len() {
                // fast path
                1 => chunks[0].len(),
                _ => chunks.iter().fold(0, |acc, arr| acc + arr.len()),
            }
        }
        let len = inner(&self.chunks);
        // Length limit is `IdxSize::MAX - 1`. We use `IdxSize::MAX` to indicate `NULL` in indexing.
        if len >= (IdxSize::MAX as usize) && CHECK_LENGTH.get() {
            panic!("{}", LENGTH_LIMIT_MSG);
        }
        self.length = len;
        self.null_count = self
            .chunks
            .iter()
            .map(|arr| arr.null_count())
            .sum::<usize>();
    }

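    /// Concatenate all chunks into a single contiguous chunk, returning a
    /// clone if the array already consists of a single chunk.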
    pub fn rechunk(&self) -> Self {
        match self.dtype() {
            #[cfg(feature = "object")]
            DataType::Object(_, _) => {
                panic!("implementation error")
            },
            _ => {
                fn inner_rechunk(chunks: &[ArrayRef]) -> Vec<ArrayRef> {
                    vec![concatenate_owned_unchecked(chunks).unwrap()]
                }

                if self.chunks.len() == 1 {
                    self.clone()
                } else {
                    let chunks = inner_rechunk(&self.chunks);

                    let mut ca = unsafe { self.copy_with_chunks(chunks) };

                    use StatisticsFlags as F;
                    ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
                    ca
                }
            },
        }
    }

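    /// Combine the validity bitmaps of all chunks into a single [`Bitmap`],
    /// returning `None` when the array contains no null values.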
    pub fn rechunk_validity(&self) -> Option<Bitmap> {
        if self.chunks.len() == 1 {
            return self.chunks[0].validity().cloned();
        }

        if !self.has_nulls() || self.is_empty() {
            return None;
        }

        let mut bm = BitmapBuilder::with_capacity(self.len());
        for arr in self.downcast_iter() {
            if let Some(v) = arr.validity() {
                bm.extend_from_bitmap(v);
            } else {
                bm.extend_constant(arr.len(), true);
            }
        }
        bm.into_opt_validity()
    }

    /// Split the array at the given offset. The chunk metadata is reallocated,
    /// but the underlying data slices are zero-copy.
    ///
    /// When `offset` is negative it is counted from the end of the array.
    /// This method will never error and will slice the best match when
    /// `offset` is out of bounds.
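    ///
    /// A minimal sketch (array contents and constructor assumed for
    /// illustration):
    /// ```ignore
    /// let ca = Int32Chunked::new("a".into(), &[1, 2, 3, 4, 5]);
    /// // A negative offset counts from the end:
    /// // left = [1, 2, 3], right = [4, 5].
    /// let (left, right) = ca.split_at(-2);
    /// ```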
    pub fn split_at(&self, offset: i64) -> (Self, Self) {
        // A normal slice, slice the buffers and thus keep the whole memory allocated.
        let (l, r) = split_at(&self.chunks, offset, self.len());
        let mut out_l = unsafe { self.copy_with_chunks(l) };
        let mut out_r = unsafe { self.copy_with_chunks(r) };

        use StatisticsFlags as F;
        out_l.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
        out_r.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);

        (out_l, out_r)
    }

    /// Slice the array. The chunk metadata is reallocated, but the underlying
    /// data slices are zero-copy.
    ///
    /// When `offset` is negative it is counted from the end of the array.
    /// This method will never error and will slice the best match when
    /// `offset` or `length` is out of bounds.
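    ///
    /// A minimal sketch (array contents and constructor assumed for
    /// illustration):
    /// ```ignore
    /// let ca = Int32Chunked::new("a".into(), &[1, 2, 3, 4, 5]);
    /// // Take up to 3 elements starting at index 1: [2, 3, 4].
    /// let s = ca.slice(1, 3);
    /// ```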
    pub fn slice(&self, offset: i64, length: usize) -> Self {
        // The len: 0 special cases ensure we release memory.
        // A normal slice, slice the buffers and thus keep the whole memory allocated.
        let exec = || {
            let (chunks, len) = slice(&self.chunks, offset, length, self.len());
            let mut out = unsafe { self.copy_with_chunks(chunks) };

            use StatisticsFlags as F;
            out.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
            out.length = len;

            out
        };

        match length {
            0 => match self.dtype() {
                #[cfg(feature = "object")]
                DataType::Object(_, _) => exec(),
                _ => self.clear(),
            },
            _ => exec(),
        }
    }

    /// Take a zero-copy view of the first `num_elements` elements.
    #[must_use]
    pub fn limit(&self, num_elements: usize) -> Self
    where
        Self: Sized,
    {
        self.slice(0, num_elements)
    }

    /// Get the head of the [`ChunkedArray`].
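    ///
    /// Defaults to the first 10 elements when `length` is `None`.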
    #[must_use]
    pub fn head(&self, length: Option<usize>) -> Self
    where
        Self: Sized,
    {
        match length {
            Some(len) => self.slice(0, std::cmp::min(len, self.len())),
            None => self.slice(0, std::cmp::min(10, self.len())),
        }
    }

    /// Get the tail of the [`ChunkedArray`].
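    ///
    /// Defaults to the last 10 elements when `length` is `None`.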
    #[must_use]
    pub fn tail(&self, length: Option<usize>) -> Self
    where
        Self: Sized,
    {
        let len = match length {
            Some(len) => std::cmp::min(len, self.len()),
            None => std::cmp::min(10, self.len()),
        };
        self.slice(-(len as i64), len)
    }

    /// Remove empty chunks, always keeping at least one chunk.
    pub fn prune_empty_chunks(&mut self) {
        let mut count = 0u32;
        unsafe {
            self.chunks_mut().retain(|arr| {
                count += 1;
                // Always keep at least one chunk
                if count == 1 {
                    true
                } else {
                    // Remove the empty chunks
                    arr.len() > 0
                }
            })
        }
    }
}

#[cfg(feature = "object")]
impl<T: PolarsObject> ObjectChunked<T> {
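    /// Rechunk by rebuilding through [`ObjectChunkedBuilder`]; object arrays
    /// are not handled by the generic concatenate kernel used in
    /// [`ChunkedArray::rechunk`].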
    pub(crate) fn rechunk_object(&self) -> Self {
        if self.chunks.len() == 1 {
            self.clone()
        } else {
            let mut builder = ObjectChunkedBuilder::new(self.name().clone(), self.len());
            let chunks = self.downcast_iter();

            // todo! use iterators once implemented
            // no_null path
            if !self.has_nulls() {
                for arr in chunks {
                    for idx in 0..arr.len() {
                        builder.append_value(arr.value(idx).clone())
                    }
                }
            } else {
                for arr in chunks {
                    for idx in 0..arr.len() {
                        if arr.is_valid(idx) {
                            builder.append_value(arr.value(idx).clone())
                        } else {
                            builder.append_null()
                        }
                    }
                }
            }
            builder.finish()
        }
    }
}

#[cfg(test)]
mod test {
    #[cfg(feature = "dtype-categorical")]
    use crate::prelude::*;

    #[test]
    #[cfg(feature = "dtype-categorical")]
    fn test_categorical_map_after_rechunk() {
        let s = Series::new(PlSmallStr::EMPTY, &["foo", "bar", "spam"]);
        let mut a = s
            .cast(&DataType::Categorical(None, Default::default()))
            .unwrap();

        a.append(&a.slice(0, 2)).unwrap();
        let a = a.rechunk();
        assert!(a.categorical().unwrap().get_rev_map().len() > 0);
    }
}