polars_core/chunked_array/
mod.rs

1//! The typed heart of every Series column.
2#![allow(unsafe_op_in_unsafe_fn)]
3use std::iter::Map;
4use std::sync::Arc;
5
6use arrow::array::*;
7use arrow::bitmap::Bitmap;
8use arrow::compute::concatenate::concatenate_unchecked;
9use polars_compute::filter::filter_with_bitmap;
10
11use crate::prelude::{ChunkTakeUnchecked, *};
12
13pub mod ops;
14#[macro_use]
15pub mod arithmetic;
16pub mod builder;
17pub mod cast;
18pub mod collect;
19pub mod comparison;
20pub mod flags;
21pub mod float;
22pub mod iterator;
23#[cfg(feature = "ndarray")]
24pub(crate) mod ndarray;
25
26#[cfg(feature = "dtype-array")]
27pub(crate) mod array;
28mod binary;
29mod binary_offset;
30mod bitwise;
31#[cfg(feature = "object")]
32mod drop;
33mod from;
34mod from_iterator;
35pub mod from_iterator_par;
36pub(crate) mod list;
37pub(crate) mod logical;
38#[cfg(feature = "object")]
39pub mod object;
40#[cfg(feature = "random")]
41mod random;
42#[cfg(feature = "dtype-struct")]
43mod struct_;
44#[cfg(any(
45    feature = "temporal",
46    feature = "dtype-datetime",
47    feature = "dtype-date"
48))]
49pub mod temporal;
50mod to_vec;
51mod trusted_len;
52
53use std::slice::Iter;
54
55use arrow::legacy::prelude::*;
56#[cfg(feature = "dtype-struct")]
57pub use struct_::StructChunked;
58
59use self::flags::{StatisticsFlags, StatisticsFlagsIM};
60use crate::series::IsSorted;
61use crate::utils::{first_non_null, last_non_null};
62
/// Stub so code can name `RevMapping` even when the "dtype-categorical"
/// feature (which provides the real implementation) is disabled.
#[cfg(not(feature = "dtype-categorical"))]
pub struct RevMapping {}
65
66pub type ChunkLenIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;
67
/// # ChunkedArray
///
/// Every Series contains a [`ChunkedArray<T>`]. Unlike [`Series`], [`ChunkedArray`]s are typed. This allows
/// us to apply closures to the data and collect the results to a [`ChunkedArray`] of the same type `T`.
/// Below we use an apply to use the cosine function to the values of a [`ChunkedArray`].
///
/// ```rust
/// # use polars_core::prelude::*;
/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float32Chunked {
///     ca.apply_values(|v| v.cos())
/// }
/// ```
///
/// ## Conversion between Series and ChunkedArrays
/// Conversion from a [`Series`] to a [`ChunkedArray`] is effortless.
///
/// ```rust
/// # use polars_core::prelude::*;
/// fn to_chunked_array(series: &Series) -> PolarsResult<&Int32Chunked>{
///     series.i32()
/// }
///
/// fn to_series(ca: Int32Chunked) -> Series {
///     ca.into_series()
/// }
/// ```
///
/// # Iterators
///
/// [`ChunkedArray`]s fully support Rust native [Iterator](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
/// and [DoubleEndedIterator](https://doc.rust-lang.org/std/iter/trait.DoubleEndedIterator.html) traits, thereby
/// giving access to all the excellent methods available for [Iterators](https://doc.rust-lang.org/std/iter/trait.Iterator.html).
///
/// ```rust
/// # use polars_core::prelude::*;
///
/// fn iter_forward(ca: &Float32Chunked) {
///     ca.iter()
///         .for_each(|opt_v| println!("{:?}", opt_v))
/// }
///
/// fn iter_backward(ca: &Float32Chunked) {
///     ca.iter()
///         .rev()
///         .for_each(|opt_v| println!("{:?}", opt_v))
/// }
/// ```
///
/// # Memory layout
///
/// [`ChunkedArray`]s use [Apache Arrow](https://github.com/apache/arrow) as backend for the memory layout.
/// Arrows memory is immutable which makes it possible to make multiple zero copy (sub)-views from a single array.
///
/// To be able to append data, Polars uses chunks to append new memory locations, hence the [`ChunkedArray<T>`] data structure.
/// Appends are cheap, because it will not lead to a full reallocation of the whole array (as could be the case with a Rust Vec).
///
/// However, multiple chunks in a [`ChunkedArray`] will slow down many operations that need random access because we have an extra indirection
/// and indexes need to be mapped to the proper chunk. Arithmetic may also be slowed down by this.
/// When multiplying two [`ChunkedArray`]s with different chunk sizes they cannot utilize [SIMD](https://en.wikipedia.org/wiki/SIMD) for instance.
///
/// If you want to have predictable performance
/// (no unexpected re-allocation of memory), it is advised to call the [`ChunkedArray::rechunk`] after
/// multiple append operations.
///
/// See also [`ChunkedArray::extend`] for appends within a chunk.
///
/// # Invariants
/// - A [`ChunkedArray`] should always have at least a single [`ArrayRef`].
/// - The [`PolarsDataType`] `T` should always map to the correct [`ArrowDataType`] in the [`ArrayRef`]
///   chunks.
/// - Nested datatypes such as [`List`] and [`Array`] store the physical types instead of the
///   logical type given by the datatype.
///
/// [`List`]: crate::datatypes::DataType::List
pub struct ChunkedArray<T: PolarsDataType> {
    /// Name and (logical) datatype of this column.
    pub(crate) field: Arc<Field>,
    /// The Arrow arrays backing the data; invariant: never empty.
    pub(crate) chunks: Vec<ArrayRef>,

    /// Cached statistics (sortedness, fast-explode hint). Metadata only; it
    /// never changes the values themselves.
    pub(crate) flags: StatisticsFlagsIM,

    // Cached total length over all chunks.
    length: usize,
    // Cached total null count over all chunks.
    null_count: usize,
    // Ties the compile-time dtype `T` to this container without storing a `T`.
    _pd: std::marker::PhantomData<T>,
}
152
impl<T: PolarsDataType> ChunkedArray<T> {
    /// Heuristic: rechunking pays off once there are multiple chunks and they
    /// average fewer than ~3 elements each.
    fn should_rechunk(&self) -> bool {
        self.chunks.len() > 1 && self.chunks.len() > self.len() / 3
    }

    /// Consume `self`, rechunking only when [`Self::should_rechunk`] says so.
    fn optional_rechunk(mut self) -> Self {
        // Rechunk if we have many small chunks.
        if self.should_rechunk() {
            self.rechunk_mut()
        }
        self
    }

    /// Upcast to `Any`, enabling runtime downcasts to the concrete type.
    pub(crate) fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    /// Series to [`ChunkedArray<T>`]
    ///
    /// # Errors
    /// Returns a `SchemaMismatch` error when the dtypes differ.
    pub fn unpack_series_matching_type<'a>(
        &self,
        series: &'a Series,
    ) -> PolarsResult<&'a ChunkedArray<T>> {
        polars_ensure!(
            self.dtype() == series.dtype(),
            SchemaMismatch: "cannot unpack series of type `{}` into `{}`",
            series.dtype(),
            self.dtype(),
        );

        // SAFETY: dtype will be correct.
        Ok(unsafe { self.unpack_series_matching_physical_type(series) })
    }

    /// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.
    ///
    /// If you want to explicitly set the `length` and `null_count`, look at
    /// [`ChunkedArray::new_with_dims`]
    fn new_with_compute_len(field: Arc<Field>, chunks: Vec<ArrayRef>) -> Self {
        unsafe {
            // The dims passed here are placeholders; `compute_len` fills in
            // the real length and null count from the chunks.
            let mut chunked_arr = Self::new_with_dims(field, chunks, 0, 0);
            chunked_arr.compute_len();
            chunked_arr
        }
    }

    /// Create a new [`ChunkedArray`] and explicitly set its `length` and `null_count`.
    /// # Safety
    /// The length and null_count must be correct.
    pub unsafe fn new_with_dims(
        field: Arc<Field>,
        chunks: Vec<ArrayRef>,
        length: usize,
        null_count: usize,
    ) -> Self {
        Self {
            field,
            chunks,
            flags: StatisticsFlagsIM::empty(),

            _pd: Default::default(),
            length,
            null_count,
        }
    }

    /// Whether the sorted-ascending statistics flag is set.
    pub(crate) fn is_sorted_ascending_flag(&self) -> bool {
        self.get_flags().is_sorted_ascending()
    }

    /// Whether the sorted-descending statistics flag is set.
    pub(crate) fn is_sorted_descending_flag(&self) -> bool {
        self.get_flags().is_sorted_descending()
    }

    /// Whether `self` is sorted in any direction.
    pub(crate) fn is_sorted_any(&self) -> bool {
        self.get_flags().is_sorted_any()
    }

    /// Clear the "can fast explode" hint for list arrays.
    pub fn unset_fast_explode_list(&mut self) {
        self.set_fast_explode_list(false)
    }

    /// Set or clear the hint that a list array can be exploded without
    /// inspecting its offsets.
    pub fn set_fast_explode_list(&mut self, value: bool) {
        let mut flags = self.flags.get_mut();
        flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value);
        self.flags.set_mut(flags);
    }

    /// Whether the "can fast explode" hint is set.
    pub fn get_fast_explode_list(&self) -> bool {
        self.get_flags().can_fast_explode_list()
    }

    /// Snapshot of the current statistics flags.
    pub fn get_flags(&self) -> StatisticsFlags {
        self.flags.get()
    }

    /// Set flags for the [`ChunkedArray`]
    pub fn set_flags(&mut self, flags: StatisticsFlags) {
        self.flags = StatisticsFlagsIM::new(flags);
    }

    /// The sortedness recorded in the statistics flags.
    pub fn is_sorted_flag(&self) -> IsSorted {
        self.get_flags().is_sorted()
    }

    /// Copy over from `from` only the flags selected by `retain_flags`, e.g.
    /// after an operation known to preserve those statistics.
    pub fn retain_flags_from<U: PolarsDataType>(
        &mut self,
        from: &ChunkedArray<U>,
        retain_flags: StatisticsFlags,
    ) {
        let flags = from.flags.get();
        // Try to avoid write contention.
        if !flags.is_empty() {
            self.set_flags(flags & retain_flags)
        }
    }

    /// Set the 'sorted' bit meta info.
    pub fn set_sorted_flag(&mut self, sorted: IsSorted) {
        let mut flags = self.flags.get_mut();
        flags.set_sorted(sorted);
        self.flags.set_mut(flags);
    }

    /// Set the 'sorted' bit meta info.
    pub fn with_sorted_flag(&self, sorted: IsSorted) -> Self {
        let mut out = self.clone();
        out.set_sorted_flag(sorted);
        out
    }

    /// Get the index of the first non null value in this [`ChunkedArray`].
    pub fn first_non_null(&self) -> Option<usize> {
        if self.null_count() == self.len() {
            None
        }
        // We now know there is at least 1 non-null item in the array, and self.len() > 0
        else if self.null_count() == 0 {
            Some(0)
        } else if self.is_sorted_any() {
            // Sorted data keeps its nulls grouped at one end; probing whether
            // element 0 is null tells us which end.
            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
                // nulls are all at the start
                self.null_count()
            } else {
                // nulls are all at the end
                0
            };

            debug_assert!(
                // If we are lucky this catches something.
                unsafe { self.get_unchecked(out) }.is_some(),
                "incorrect sorted flag"
            );

            Some(out)
        } else {
            first_non_null(self.iter_validities())
        }
    }

    /// Get the index of the last non null value in this [`ChunkedArray`].
    pub fn last_non_null(&self) -> Option<usize> {
        if self.null_count() == self.len() {
            None
        }
        // We now know there is at least 1 non-null item in the array, and self.len() > 0
        else if self.null_count() == 0 {
            Some(self.len() - 1)
        } else if self.is_sorted_any() {
            // Sorted data keeps its nulls grouped at one end; probing whether
            // element 0 is null tells us which end.
            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
                // nulls are all at the start
                self.len() - 1
            } else {
                // nulls are all at the end
                self.len() - self.null_count() - 1
            };

            debug_assert!(
                // If we are lucky this catches something.
                unsafe { self.get_unchecked(out) }.is_some(),
                "incorrect sorted flag"
            );

            Some(out)
        } else {
            last_non_null(self.iter_validities(), self.len())
        }
    }

    /// Return a copy of this array with all null values filtered out.
    pub fn drop_nulls(&self) -> Self {
        if self.null_count() == 0 {
            self.clone()
        } else {
            let chunks = self
                .downcast_iter()
                .map(|arr| {
                    if arr.null_count() == 0 {
                        arr.to_boxed()
                    } else {
                        // Keep exactly the positions where the validity is set.
                        filter_with_bitmap(arr, arr.validity().unwrap())
                    }
                })
                .collect();
            // SAFETY: filtering removed exactly `null_count` elements and the
            // result contains no nulls.
            unsafe {
                Self::new_with_dims(
                    self.field.clone(),
                    chunks,
                    self.len() - self.null_count(),
                    0,
                )
            }
        }
    }

    /// Get the buffer of bits representing null values
    #[inline]
    #[allow(clippy::type_complexity)]
    pub fn iter_validities(&self) -> Map<Iter<'_, ArrayRef>, fn(&ArrayRef) -> Option<&Bitmap>> {
        // A named fn (rather than a closure) so the return type is nameable.
        fn to_validity(arr: &ArrayRef) -> Option<&Bitmap> {
            arr.validity()
        }
        self.chunks.iter().map(to_validity)
    }

    #[inline]
    /// Return if any of the chunks in this [`ChunkedArray`] have nulls.
    pub fn has_nulls(&self) -> bool {
        self.null_count > 0
    }

    /// Shrink the capacity of this array to fit its length.
    pub fn shrink_to_fit(&mut self) {
        self.chunks = vec![concatenate_unchecked(self.chunks.as_slice()).unwrap()];
    }

    /// Return an empty array with the same name and dtype, retaining the
    /// statistics flags that trivially hold for empty data.
    pub fn clear(&self) -> Self {
        // SAFETY: we keep the correct dtype
        let mut ca = unsafe {
            self.copy_with_chunks(vec![new_empty_array(
                self.chunks.first().unwrap().dtype().clone(),
            )])
        };

        use StatisticsFlags as F;
        ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
        ca
    }

    /// Unpack a [`Series`] to the same physical type.
    ///
    /// # Safety
    ///
    /// This is unsafe as the dtype may be incorrect and
    /// is assumed to be correct in other safe code.
    pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(
        &self,
        series: &'a Series,
    ) -> &'a ChunkedArray<T> {
        let series_trait = &**series;
        if self.dtype() == series.dtype() {
            &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
        } else {
            use DataType::*;
            match (self.dtype(), series.dtype()) {
                // Logical temporal types share their physical integer layout,
                // so the pointer cast is still valid for these pairs.
                (Int64, Datetime(_, _)) | (Int64, Duration(_)) | (Int32, Date) => {
                    &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
                },
                _ => panic!(
                    "cannot unpack series {:?} into matching type {:?}",
                    series,
                    self.dtype()
                ),
            }
        }
    }

    /// Returns an iterator over the lengths of the chunks of the array.
    pub fn chunk_lengths(&self) -> ChunkLenIter<'_> {
        self.chunks.iter().map(|chunk| chunk.len())
    }

    /// A reference to the chunks
    #[inline]
    pub fn chunks(&self) -> &Vec<ArrayRef> {
        &self.chunks
    }

    /// A mutable reference to the chunks
    ///
    /// # Safety
    /// The caller must ensure to not change the [`DataType`] or `length` of any of the chunks.
    /// And the `null_count` remains correct.
    #[inline]
    pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {
        &mut self.chunks
    }

    /// Returns true if contains a single chunk and has no null values
    pub fn is_optimal_aligned(&self) -> bool {
        self.chunks.len() == 1 && self.null_count() == 0
    }

    /// Create a new [`ChunkedArray`] from self, where the chunks are replaced.
    ///
    /// # Safety
    /// The caller must ensure the dtypes of the chunks are correct
    unsafe fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
        Self::new_with_compute_len(self.field.clone(), chunks)
    }

    /// Get data type of [`ChunkedArray`].
    pub fn dtype(&self) -> &DataType {
        self.field.dtype()
    }

    /// Replace the logical dtype of the field.
    ///
    /// # Safety
    /// The new dtype must be compatible with the chunks' physical layout.
    pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) {
        self.field = Arc::new(Field::new(self.name().clone(), dtype))
    }

    /// Name of the [`ChunkedArray`].
    pub fn name(&self) -> &PlSmallStr {
        self.field.name()
    }

    /// Get a reference to the field.
    pub fn ref_field(&self) -> &Field {
        &self.field
    }

    /// Rename this [`ChunkedArray`].
    pub fn rename(&mut self, name: PlSmallStr) {
        self.field = Arc::new(Field::new(name, self.field.dtype().clone()));
    }

    /// Return this [`ChunkedArray`] with a new name.
    pub fn with_name(mut self, name: PlSmallStr) -> Self {
        self.rename(name);
        self
    }
}
493
494impl<T> ChunkedArray<T>
495where
496    T: PolarsDataType,
497{
498    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
499    /// indicates a NULL value.
500    ///
501    /// # Panics
502    /// This function will panic if `idx` is out of bounds.
503    #[inline]
504    pub fn get(&self, idx: usize) -> Option<T::Physical<'_>> {
505        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
506        assert!(
507            chunk_idx < self.chunks().len(),
508            "index: {} out of bounds for len: {}",
509            idx,
510            self.len()
511        );
512        unsafe {
513            let arr = self.downcast_get_unchecked(chunk_idx);
514            assert!(
515                arr_idx < arr.len(),
516                "index: {} out of bounds for len: {}",
517                idx,
518                self.len()
519            );
520            arr.get_unchecked(arr_idx)
521        }
522    }
523
524    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
525    /// indicates a NULL value.
526    ///
527    /// # Safety
528    /// It is the callers responsibility that the `idx < self.len()`.
529    #[inline]
530    pub unsafe fn get_unchecked(&self, idx: usize) -> Option<T::Physical<'_>> {
531        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
532
533        unsafe {
534            // SAFETY: up to the caller to make sure the index is valid.
535            self.downcast_get_unchecked(chunk_idx)
536                .get_unchecked(arr_idx)
537        }
538    }
539
540    /// Get a single value from this [`ChunkedArray`]. Null values are ignored and the returned
541    /// value could be garbage if it was masked out by NULL. Note that the value always is initialized.
542    ///
543    /// # Safety
544    /// It is the callers responsibility that the `idx < self.len()`.
545    #[inline]
546    pub unsafe fn value_unchecked(&self, idx: usize) -> T::Physical<'_> {
547        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
548
549        unsafe {
550            // SAFETY: up to the caller to make sure the index is valid.
551            self.downcast_get_unchecked(chunk_idx)
552                .value_unchecked(arr_idx)
553        }
554    }
555
556    #[inline]
557    pub fn first(&self) -> Option<T::Physical<'_>> {
558        unsafe {
559            let arr = self.downcast_get_unchecked(0);
560            arr.get_unchecked(0)
561        }
562    }
563
564    #[inline]
565    pub fn last(&self) -> Option<T::Physical<'_>> {
566        unsafe {
567            let arr = self.downcast_get_unchecked(self.chunks.len().checked_sub(1)?);
568            arr.get_unchecked(arr.len().checked_sub(1)?)
569        }
570    }
571
572    pub fn set_validity(&mut self, validity: &Bitmap) {
573        assert_eq!(self.len(), validity.len());
574        let mut i = 0;
575        for chunk in unsafe { self.chunks_mut() } {
576            *chunk = chunk.with_validity(Some(validity.clone().sliced(i, chunk.len())));
577            i += chunk.len();
578        }
579        self.null_count = validity.unset_bits();
580        self.set_fast_explode_list(false);
581    }
582}
583
impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
    ChunkedArray<T>: ChunkTakeUnchecked<[IdxSize]>,
{
    /// Deposit values into nulls with a certain validity mask.
    ///
    /// `self` must be null-free and hold exactly one value per set bit in
    /// `validity`; the result has `validity.len()` elements with `self`'s
    /// values at the set positions and nulls elsewhere.
    pub fn deposit(&self, validity: &Bitmap) -> Self {
        let set_bits = validity.set_bits();

        assert_eq!(self.null_count(), 0);
        assert_eq!(self.len(), set_bits);

        // All bits set: every value keeps its position.
        if set_bits == validity.len() {
            return self.clone();
        }

        // No bits set: the result is fully null.
        if set_bits == 0 {
            return Self::full_null_like(self, validity.len());
        }

        let mut null_mask = validity.clone();

        // Build one gather index per output position. Null positions gather an
        // arbitrary in-bounds index; their value is masked out again below.
        let mut gather_idxs = Vec::with_capacity(validity.len());
        let leading_nulls = null_mask.take_leading_zeros();
        // The leading nulls and the first valid position all gather value 0.
        gather_idxs.extend(std::iter::repeat_n(0, leading_nulls + 1));

        // Running count of set bits: each later valid position advances to the
        // next value index, null positions re-use the previous one.
        let mut i = 0 as IdxSize;
        gather_idxs.extend(null_mask.iter().skip(1).map(|v| {
            i += IdxSize::from(v);
            i
        }));

        // SAFETY: every gather index is < self.len() by construction.
        let mut ca = unsafe { ChunkTakeUnchecked::take_unchecked(self, &gather_idxs) };
        ca.set_validity(validity);
        ca
    }
}
621
622impl ListChunked {
623    #[inline]
624    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
625        unsafe {
626            Some(Series::from_chunks_and_dtype_unchecked(
627                self.name().clone(),
628                vec![self.get(idx)?],
629                &self.inner_dtype().to_physical(),
630            ))
631        }
632    }
633
634    pub fn has_masked_out_values(&self) -> bool {
635        for arr in self.downcast_iter() {
636            if arr.is_empty() {
637                continue;
638            }
639
640            if *arr.offsets().first() != 0 || *arr.offsets().last() != arr.values().len() as i64 {
641                return true;
642            }
643
644            let Some(validity) = arr.validity() else {
645                continue;
646            };
647            if validity.set_bits() == 0 {
648                continue;
649            }
650
651            // @Performance: false_idx_iter
652            for i in (!validity).true_idx_iter() {
653                if arr.offsets().length_at(i) > 0 {
654                    return true;
655                }
656            }
657        }
658
659        false
660    }
661}
662
#[cfg(feature = "dtype-array")]
impl ArrayChunked {
    /// Get the element at `idx` as a [`Series`] over the inner dtype's
    /// physical representation, or `None` if it is null.
    #[inline]
    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
        unsafe {
            Some(Series::from_chunks_and_dtype_unchecked(
                self.name().clone(),
                vec![self.get(idx)?],
                &self.inner_dtype().to_physical(),
            ))
        }
    }

    /// Build an [`ArrayChunked`] from chunks of flat values, each chunk's
    /// length being a multiple of `width`.
    ///
    /// `length` is the total number of outer elements over all chunks.
    pub fn from_aligned_values(
        name: PlSmallStr,
        inner_dtype: &DataType,
        width: usize,
        chunks: Vec<ArrayRef>,
        length: usize,
    ) -> Self {
        let dtype = DataType::Array(Box::new(inner_dtype.clone()), width);
        let arrow_dtype = dtype.to_arrow(CompatLevel::newest());
        let field = Arc::new(Field::new(name, dtype));
        if width == 0 {
            use arrow::array::builder::{ArrayBuilder, make_builder};
            // Zero-width arrays carry no values; a single empty values array
            // backs all `length` outer elements.
            let values = make_builder(&inner_dtype.to_arrow(CompatLevel::newest())).freeze();
            return ArrayChunked::new_with_compute_len(
                field,
                vec![FixedSizeListArray::new(arrow_dtype, length, values, None).into_boxed()],
            );
        }

        let chunks = chunks
            .into_iter()
            .map(|chunk| {
                debug_assert_eq!(chunk.len() % width, 0);
                // Fix: each FixedSizeListArray's outer length must derive from
                // its own chunk; passing the total `length` for every chunk
                // was only correct when there is a single chunk.
                let chunk_len = chunk.len() / width;
                FixedSizeListArray::new(arrow_dtype.clone(), chunk_len, chunk, None).into_boxed()
            })
            .collect();

        // SAFETY: `length` is the caller-provided total, and no chunk carries
        // a validity mask, so the null count is 0.
        unsafe { Self::new_with_dims(field, chunks, length, 0) }
    }

    /// Turn the ArrayChunked into the ListChunked with the same items.
    ///
    /// This will always zero copy the values into the ListChunked.
    pub fn to_list(&self) -> ListChunked {
        let inner_dtype = self.inner_dtype();
        let chunks = self
            .downcast_iter()
            .map(|chunk| {
                use arrow::offset::OffsetsBuffer;

                let inner_dtype = chunk.dtype().inner_dtype().unwrap();
                let dtype = inner_dtype.clone().to_large_list(true);

                // Fixed width means the offsets are simply i * width.
                let offsets = (0..=chunk.len())
                    .map(|i| (i * self.width()) as i64)
                    .collect::<Vec<i64>>();

                // SAFETY: We created our offsets in ascending manner.
                let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };

                ListArray::<i64>::new(
                    dtype,
                    offsets,
                    chunk.values().clone(),
                    chunk.validity().cloned(),
                )
                .into_boxed()
            })
            .collect();

        // SAFETY: All the items were mapped 1-1 with the validity staying the same.
        let mut ca = unsafe {
            ListChunked::new_with_dims(
                Arc::new(Field::new(
                    self.name().clone(),
                    DataType::List(Box::new(inner_dtype.clone())),
                )),
                chunks,
                self.len(),
                self.null_count(),
            )
        };
        // Without nulls every entry explodes to exactly `width` items.
        ca.set_fast_explode_list(!self.has_nulls());
        ca
    }
}
752
753impl<T> ChunkedArray<T>
754where
755    T: PolarsDataType,
756{
757    /// Should be used to match the chunk_id of another [`ChunkedArray`].
758    /// # Panics
759    /// It is the callers responsibility to ensure that this [`ChunkedArray`] has a single chunk.
760    pub fn match_chunks<I>(&self, chunk_id: I) -> Self
761    where
762        I: Iterator<Item = usize>,
763    {
764        debug_assert!(self.chunks.len() == 1);
765        // Takes a ChunkedArray containing a single chunk.
766        let slice = |ca: &Self| {
767            let array = &ca.chunks[0];
768
769            let mut offset = 0;
770            let chunks = chunk_id
771                .map(|len| {
772                    // SAFETY: within bounds.
773                    debug_assert!((offset + len) <= array.len());
774                    let out = unsafe { array.sliced_unchecked(offset, len) };
775                    offset += len;
776                    out
777                })
778                .collect();
779
780            debug_assert_eq!(offset, array.len());
781
782            // SAFETY: We just slice the original chunks, their type will not change.
783            unsafe {
784                Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone())
785            }
786        };
787
788        if self.chunks.len() != 1 {
789            let out = self.rechunk();
790            slice(&out)
791        } else {
792            slice(self)
793        }
794    }
795}
796
impl<T: PolarsDataType> AsRefDataType for ChunkedArray<T> {
    /// The array's (logical) [`DataType`].
    fn as_ref_dtype(&self) -> &DataType {
        self.dtype()
    }
}
802
pub(crate) trait AsSinglePtr: AsRefDataType {
    /// Rechunk and return a ptr to the start of the array
    ///
    /// The default implementation errors: only numeric arrays expose a single
    /// contiguous values buffer to point into.
    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
        polars_bail!(opq = as_single_ptr, self.as_ref_dtype());
    }
}
809
810impl<T> AsSinglePtr for ChunkedArray<T>
811where
812    T: PolarsNumericType,
813{
814    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
815        self.rechunk_mut();
816        let a = self.data_views().next().unwrap();
817        let ptr = a.as_ptr();
818        Ok(ptr as usize)
819    }
820}
821
// These types have no single contiguous numeric values buffer; they keep the
// trait's default implementation, which returns an error.
impl AsSinglePtr for BooleanChunked {}
impl AsSinglePtr for ListChunked {}
#[cfg(feature = "dtype-array")]
impl AsSinglePtr for ArrayChunked {}
impl AsSinglePtr for StringChunked {}
impl AsSinglePtr for BinaryChunked {}
#[cfg(feature = "object")]
impl<T: PolarsObject> AsSinglePtr for ObjectChunked<T> {}
830
/// Classification of a [`ChunkedArray`]'s physical layout, used to dispatch to
/// the fastest code path.
pub enum ChunkedArrayLayout<'a, T: PolarsDataType> {
    /// A single chunk without null values.
    SingleNoNull(&'a T::Array),
    /// A single chunk, possibly containing nulls.
    Single(&'a T::Array),
    /// Multiple chunks, none containing nulls.
    MultiNoNull(&'a ChunkedArray<T>),
    /// Multiple chunks, at least one containing nulls.
    Multi(&'a ChunkedArray<T>),
}
837
838impl<T> ChunkedArray<T>
839where
840    T: PolarsDataType,
841{
842    pub fn layout(&self) -> ChunkedArrayLayout<'_, T> {
843        if self.chunks.len() == 1 {
844            let arr = self.downcast_iter().next().unwrap();
845            return if arr.null_count() == 0 {
846                ChunkedArrayLayout::SingleNoNull(arr)
847            } else {
848                ChunkedArrayLayout::Single(arr)
849            };
850        }
851
852        if self.downcast_iter().all(|a| a.null_count() == 0) {
853            ChunkedArrayLayout::MultiNoNull(self)
854        } else {
855            ChunkedArrayLayout::Multi(self)
856        }
857    }
858}
859
impl<T> ChunkedArray<T>
where
    T: PolarsNumericType,
{
    /// Returns the values of the array as a contiguous slice.
    ///
    /// # Errors
    /// Errors unless the array is a single chunk without null values.
    pub fn cont_slice(&self) -> PolarsResult<&[T::Native]> {
        polars_ensure!(
            self.chunks.len() == 1 && self.chunks[0].null_count() == 0,
            ComputeError: "chunked array is not contiguous"
        );
        Ok(self.downcast_iter().next().map(|arr| arr.values()).unwrap())
    }

    /// Returns the values of the array as a contiguous mutable slice.
    ///
    /// `None` unless the array is a single null-free chunk whose buffer can
    /// be mutated (see `get_mut_values`).
    pub(crate) fn cont_slice_mut(&mut self) -> Option<&mut [T::Native]> {
        if self.chunks.len() == 1 && self.chunks[0].null_count() == 0 {
            // SAFETY, we will not swap the PrimitiveArray.
            let arr = unsafe { self.downcast_iter_mut().next().unwrap() };
            arr.get_mut_values()
        } else {
            None
        }
    }

    /// Get slices of the underlying arrow data.
    /// NOTE: null values should be taken into account by the user of these slices as they are handled
    /// separately
    pub fn data_views(&self) -> impl DoubleEndedIterator<Item = &[T::Native]> {
        self.downcast_iter().map(|arr| arr.values().as_slice())
    }

    /// Iterate over all underlying values as `T::Native`, ignoring validity;
    /// slots masked out by nulls yield their (initialized) physical value.
    #[allow(clippy::wrong_self_convention)]
    pub fn into_no_null_iter(
        &self,
    ) -> impl '_ + Send + Sync + ExactSizeIterator<Item = T::Native> + DoubleEndedIterator + TrustedLen
    {
        // .copied was significantly slower in benchmark, next call did not inline?
        #[allow(clippy::map_clone)]
        // we know the iterators len
        unsafe {
            self.data_views()
                .flatten()
                .map(|v| *v)
                .trust_my_length(self.len())
        }
    }
}
907
908impl<T: PolarsDataType> Clone for ChunkedArray<T> {
909    fn clone(&self) -> Self {
910        ChunkedArray {
911            field: self.field.clone(),
912            chunks: self.chunks.clone(),
913            flags: self.flags.clone(),
914
915            _pd: Default::default(),
916            length: self.length,
917            null_count: self.null_count,
918        }
919    }
920}
921
impl<T: PolarsDataType> AsRef<ChunkedArray<T>> for ChunkedArray<T> {
    /// Identity; lets generic code accept `impl AsRef<ChunkedArray<T>>`.
    fn as_ref(&self) -> &ChunkedArray<T> {
        self
    }
}
927
928impl ValueSize for ListChunked {
929    fn get_values_size(&self) -> usize {
930        self.chunks
931            .iter()
932            .fold(0usize, |acc, arr| acc + arr.get_values_size())
933    }
934}
935
#[cfg(feature = "dtype-array")]
impl ValueSize for ArrayChunked {
    /// Sum of the underlying values' sizes over every chunk.
    fn get_values_size(&self) -> usize {
        self.chunks.iter().map(|arr| arr.get_values_size()).sum()
    }
}
944impl ValueSize for StringChunked {
945    fn get_values_size(&self) -> usize {
946        self.chunks
947            .iter()
948            .fold(0usize, |acc, arr| acc + arr.get_values_size())
949    }
950}
951
952impl ValueSize for BinaryOffsetChunked {
953    fn get_values_size(&self) -> usize {
954        self.chunks
955            .iter()
956            .fold(0usize, |acc, arr| acc + arr.get_values_size())
957    }
958}
959
960pub(crate) fn to_primitive<T: PolarsNumericType>(
961    values: Vec<T::Native>,
962    validity: Option<Bitmap>,
963) -> PrimitiveArray<T::Native> {
964    PrimitiveArray::new(
965        T::get_static_dtype().to_arrow(CompatLevel::newest()),
966        values.into(),
967        validity,
968    )
969}
970
971pub(crate) fn to_array<T: PolarsNumericType>(
972    values: Vec<T::Native>,
973    validity: Option<Bitmap>,
974) -> ArrayRef {
975    Box::new(to_primitive::<T>(values, validity))
976}
977
978impl<T: PolarsDataType> Default for ChunkedArray<T> {
979    fn default() -> Self {
980        let dtype = T::get_static_dtype();
981        let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());
982        ChunkedArray {
983            field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)),
984            // Invariant: always has 1 chunk.
985            chunks: vec![new_empty_array(arrow_dtype)],
986            flags: StatisticsFlagsIM::empty(),
987
988            _pd: Default::default(),
989            length: 0,
990            null_count: 0,
991        }
992    }
993}
994
#[cfg(test)]
pub(crate) mod test {
    use crate::prelude::*;

    /// Shared fixture: a single-chunk Int32 array named "a" holding [1, 2, 3].
    pub(crate) fn get_chunked_array() -> Int32Chunked {
        ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3])
    }

    #[test]
    fn test_sort() {
        let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]);
        let b = a
            .sort(false)
            .into_iter()
            .map(|opt| opt.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(b, [1, 2, 3, 9]);
        let a = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]);
        let a = a.sort(false);
        let b = a.into_iter().collect::<Vec<_>>();
        assert_eq!(b, [Some("a"), Some("b"), Some("c")]);
        // Sorting should also set the sorted-ascending statistics flag.
        assert!(a.is_sorted_ascending_flag());
    }

    #[test]
    fn arithmetic() {
        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]);
        let b = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]);

        // Not really asserting anything here but still making sure the code is exercised
        // This (and more) is properly tested from the integration test suite and Python bindings.
        println!("{:?}", a + b);
        println!("{:?}", a - b);
        println!("{:?}", a * b);
        println!("{:?}", a / b);
    }

    #[test]
    fn iter() {
        let s1 = get_chunked_array();
        // sum via the Option-yielding iterator: 1 + 2 + 3
        assert_eq!(s1.into_iter().fold(0, |acc, val| { acc + val.unwrap() }), 6)
    }

    #[test]
    fn limit() {
        let a = get_chunked_array();
        let b = a.limit(2);
        println!("{b:?}");
        assert_eq!(b.len(), 2)
    }

    #[test]
    fn filter() {
        let a = get_chunked_array();
        // Keep only the first element of [1, 2, 3].
        let b = a
            .filter(&BooleanChunked::new(
                PlSmallStr::from_static("filter"),
                &[true, false, false],
            ))
            .unwrap();
        assert_eq!(b.len(), 1);
        assert_eq!(b.into_iter().next(), Some(Some(1)));
    }

    #[test]
    fn aggregates() {
        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]);
        assert_eq!(a.max(), Some(100));
        assert_eq!(a.min(), Some(1));
        assert_eq!(a.sum(), Some(120))
    }

    #[test]
    fn take() {
        let a = get_chunked_array();
        let new = a.take(&[0 as IdxSize, 1]).unwrap();
        assert_eq!(new.len(), 2)
    }

    #[test]
    fn cast() {
        let a = get_chunked_array();
        let b = a.cast(&DataType::Int64).unwrap();
        assert_eq!(b.dtype(), &DataType::Int64)
    }

    /// Assert that a numeric ChunkedArray (assumed null-free) equals `eq` element-wise.
    fn assert_slice_equal<T>(ca: &ChunkedArray<T>, eq: &[T::Native])
    where
        T: PolarsNumericType,
    {
        assert_eq!(ca.iter().map(|opt| opt.unwrap()).collect::<Vec<_>>(), eq)
    }

    #[test]
    fn slice() {
        // Build a two-chunk array [0..=5] so slicing is exercised across chunk boundaries.
        let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]);
        let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]);
        first.append(&second).unwrap();
        assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]);
        assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]);
        assert_slice_equal(&first.slice(1, 4), &[1, 2, 3, 4]);
        assert_slice_equal(&first.slice(3, 2), &[3, 4]);
        assert_slice_equal(&first.slice(3, 3), &[3, 4, 5]);
        // Negative offsets count from the end.
        assert_slice_equal(&first.slice(-3, 3), &[3, 4, 5]);
        assert_slice_equal(&first.slice(-6, 6), &[0, 1, 2, 3, 4, 5]);

        // Out-of-bounds requests are clamped rather than panicking.
        assert_eq!(first.slice(-7, 2).len(), 1);
        assert_eq!(first.slice(-3, 4).len(), 3);
        assert_eq!(first.slice(3, 4).len(), 3);
        assert_eq!(first.slice(10, 4).len(), 0);
    }

    #[test]
    fn sorting() {
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[9, 2, 4]);
        let sorted = s.sort(false);
        assert_slice_equal(&sorted, &[2, 4, 9]);
        let sorted = s.sort(true);
        assert_slice_equal(&sorted, &[9, 4, 2]);

        let s: StringChunked = ["b", "a", "z"].iter().collect();
        let sorted = s.sort(false);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[Some("a"), Some("b"), Some("z")]
        );
        let sorted = s.sort(true);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[Some("z"), Some("b"), Some("a")]
        );
        // Nulls sort first in ascending order.
        let s: StringChunked = [Some("b"), None, Some("z")].iter().copied().collect();
        let sorted = s.sort(false);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[None, Some("b"), Some("z")]
        );
    }

    #[test]
    fn reverse() {
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3]);
        // path with continuous slice
        assert_slice_equal(&s.reverse(), &[3, 2, 1]);
        // path with options
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[Some(1), None, Some(3)]);
        assert_eq!(Vec::from(&s.reverse()), &[Some(3), None, Some(1)]);
        let s = BooleanChunked::new(PlSmallStr::EMPTY, &[true, false]);
        assert_eq!(Vec::from(&s.reverse()), &[Some(false), Some(true)]);

        let s = StringChunked::new(PlSmallStr::EMPTY, &["a", "b", "c"]);
        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), Some("b"), Some("a")]);

        let s = StringChunked::new(PlSmallStr::EMPTY, &[Some("a"), None, Some("c")]);
        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), None, Some("a")]);
    }

    #[test]
    #[cfg(feature = "dtype-categorical")]
    fn test_iter_categorical() {
        let ca = StringChunked::new(
            PlSmallStr::EMPTY,
            &[Some("foo"), None, Some("bar"), Some("ham")],
        );
        let cats = Categories::new(
            PlSmallStr::EMPTY,
            PlSmallStr::EMPTY,
            CategoricalPhysical::U32,
        );
        let ca = ca.cast(&DataType::from_categories(cats)).unwrap();
        let ca = ca.cat32().unwrap();
        // Physical codes are assigned in first-appearance order; null stays null.
        let v: Vec<_> = ca.physical().into_iter().collect();
        assert_eq!(v, &[Some(0), None, Some(1), Some(2)]);
    }

    #[test]
    #[ignore]
    // NOTE(review): ignored test — presumably allocation-size dependent and flaky
    // across allocators/platforms; confirm before re-enabling.
    fn test_shrink_to_fit() {
        // Builder over-allocates (capacity 2048) for a single value, so shrinking
        // should reduce the estimated byte size.
        let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048);
        builder.append_value("foo");
        let mut arr = builder.finish();
        let before = arr
            .chunks()
            .iter()
            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
            .sum::<usize>();
        arr.shrink_to_fit();
        let after = arr
            .chunks()
            .iter()
            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
            .sum::<usize>();
        assert!(before > after);
    }
}