polars_core/frame/mod.rs
1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(any(feature = "rows", feature = "object"))]
33pub mod row;
34mod top_k;
35mod upstream_traits;
36mod validation;
37
38use arrow::record_batch::{RecordBatch, RecordBatchT};
39use polars_utils::pl_str::PlSmallStr;
40#[cfg(feature = "serde")]
41use serde::{Deserialize, Serialize};
42use strum_macros::IntoStaticStr;
43
44use crate::POOL;
45#[cfg(feature = "row_hash")]
46use crate::hashing::_df_rows_to_hashes_threaded_vertical;
47use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
48use crate::series::IsSorted;
49
50#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
51#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
52#[strum(serialize_all = "snake_case")]
53pub enum UniqueKeepStrategy {
54 /// Keep the first unique row.
55 First,
56 /// Keep the last unique row.
57 Last,
58 /// Keep none of the unique rows.
59 None,
60 /// Keep any of the unique rows.
61 /// This allows more optimizations.
62 #[default]
63 Any,
64}
65
66fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
67where
68 F: for<'a> FnMut(&'a T) -> &'a str,
69{
70 // Always unique.
71 if items.len() <= 1 {
72 return Ok(());
73 }
74
75 if items.len() <= 4 {
76 // Too small to be worth spawning a hashmap for; this is at most 6 comparisons.
77 for i in 0..items.len() - 1 {
78 let name = get_name(&items[i]);
79 for other in items.iter().skip(i + 1) {
80 if name == get_name(other) {
81 polars_bail!(duplicate = name);
82 }
83 }
84 }
85 } else {
86 let mut names = PlHashSet::with_capacity(items.len());
87 for item in items {
88 let name = get_name(item);
89 if !names.insert(name) {
90 polars_bail!(duplicate = name);
91 }
92 }
93 }
94 Ok(())
95}
96
97/// A contiguous growable collection of `Series` that have the same length.
98///
99/// ## Use declarations
100///
101/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
102///
103/// ```rust
104/// use polars_core::prelude::*; // if the crate polars-core is used directly
105/// // use polars::prelude::*; if the crate polars is used
106/// ```
107///
108/// # Initialization
109/// ## Default
110///
111/// A `DataFrame` can be initialized empty:
112///
113/// ```rust
114/// # use polars_core::prelude::*;
115/// let df = DataFrame::default();
116/// assert!(df.is_empty());
117/// ```
118///
119/// ## Wrapping a `Vec<Series>`
120///
121 /// A `DataFrame` is built upon a `Vec<Column>` where the columns have the same length.
122///
123/// ```rust
124/// # use polars_core::prelude::*;
125/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
126/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
127///
128/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
129/// ```
130///
131/// ## Using a macro
132///
133/// The [`df!`] macro is a convenient method:
134///
135/// ```rust
136/// # use polars_core::prelude::*;
137/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
138/// "Color" => ["Red", "Yellow", "Green"]);
139/// ```
140///
141/// ## Using a CSV file
142///
143/// See the `polars_io::csv::CsvReader`.
144///
145/// # Indexing
146/// ## By a number
147///
148 /// `Index<usize>` is implemented for `DataFrame`.
149///
150/// ```rust
151/// # use polars_core::prelude::*;
152/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
153/// "Color" => ["Red", "Yellow", "Green"])?;
154///
155/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
156/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
157/// # Ok::<(), PolarsError>(())
158/// ```
159///
160/// ## By a `Series` name
161///
162/// ```rust
163/// # use polars_core::prelude::*;
164/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
165/// "Color" => ["Red", "Yellow", "Green"])?;
166///
167/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
168/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
169/// # Ok::<(), PolarsError>(())
170/// ```
171#[derive(Clone)]
172pub struct DataFrame {
173 height: usize,
174 // invariant: columns[i].len() == height for each 0 <= i < columns.len()
175 pub(crate) columns: Vec<Column>,
176
177 /// A cached schema. This might not give correct results if the DataFrame is modified in place
178 /// between caching the schema and reading it.
179 cached_schema: OnceLock<SchemaRef>,
180}
181
182impl DataFrame {
183 pub fn clear_schema(&mut self) {
184 self.cached_schema = OnceLock::new();
185 }
186
187 #[inline]
188 pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
189 self.columns.iter()
190 }
191
192 #[inline]
193 pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
194 self.columns.iter().map(Column::as_materialized_series)
195 }
196
197 #[inline]
198 pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
199 self.columns.par_iter().map(Column::as_materialized_series)
200 }
201
202 /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
203 ///
204 /// # Implementation
205 /// This estimation is the sum of the sizes of its buffers and validity bitmaps, including nested arrays.
206 /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
207 /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
208 ///
209 /// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
210 /// However, this function will yield a smaller number. This is because this function returns
211 /// the visible size of the buffer, not its total capacity.
212 ///
213 /// FFI buffers are included in this estimation.
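///
/// # Example
///
/// A minimal sketch; the exact value depends on the allocation layout, so only a loose
/// assertion is made here:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("values" => [1i64, 2, 3])?;
/// assert!(df.estimated_size() > 0);
/// # Ok::<(), PolarsError>(())
/// ```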
214 pub fn estimated_size(&self) -> usize {
215 self.columns.iter().map(Column::estimated_size).sum()
216 }
217
218 // Reduce monomorphization.
219 fn try_apply_columns(
220 &self,
221 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
222 ) -> PolarsResult<Vec<Column>> {
223 self.columns.iter().map(func).collect()
224 }
225 // Reduce monomorphization.
226 pub fn _apply_columns(&self, func: &(dyn Fn(&Column) -> Column)) -> Vec<Column> {
227 self.columns.iter().map(func).collect()
228 }
229 // Reduce monomorphization.
230 fn try_apply_columns_par(
231 &self,
232 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
233 ) -> PolarsResult<Vec<Column>> {
234 POOL.install(|| self.columns.par_iter().map(func).collect())
235 }
236 // Reduce monomorphization.
237 pub fn _apply_columns_par(
238 &self,
239 func: &(dyn Fn(&Column) -> Column + Send + Sync),
240 ) -> Vec<Column> {
241 POOL.install(|| self.columns.par_iter().map(func).collect())
242 }
243
244 /// Get the index of the column.
245 fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
246 self.get_column_index(name)
247 .ok_or_else(|| polars_err!(col_not_found = name))
248 }
249
250 fn check_already_present(&self, name: &str) -> PolarsResult<()> {
251 polars_ensure!(
252 self.columns.iter().all(|s| s.name().as_str() != name),
253 Duplicate: "column with name {:?} is already present in the DataFrame", name
254 );
255 Ok(())
256 }
257
258 /// Reserve `additional` chunk slots in each series' chunk vector.
259 pub(crate) fn reserve_chunks(&mut self, additional: usize) {
260 for s in &mut self.columns {
261 if let Column::Series(s) = s {
262 // SAFETY:
263 // do not modify the data, simply resize.
264 unsafe { s.chunks_mut().reserve(additional) }
265 }
266 }
267 }
268
269 /// Create a DataFrame from a Vector of Series.
270 ///
271 /// Errors if column names are not unique or if heights are not all equal.
272 ///
273 /// # Example
274 ///
275 /// ```
276 /// # use polars_core::prelude::*;
277 /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
278 /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
279 ///
280 /// let df = DataFrame::new(vec![s0, s1])?;
281 /// # Ok::<(), PolarsError>(())
282 /// ```
283 pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
284 DataFrame::validate_columns_slice(&columns)
285 .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {}", e)))?;
286 Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
287 }
288
289 pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
290 for col in &columns {
291 polars_ensure!(
292 col.len() == height,
293 ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
294 columns[0].name(), height, col.name(), col.len()
295 );
296 }
297
298 Ok(DataFrame {
299 height,
300 columns,
301 cached_schema: OnceLock::new(),
302 })
303 }
304
305 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
306 /// columns to match the other columns.
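///
/// # Example
///
/// A minimal sketch (column names are illustrative): the length-1 column is repeated to
/// match the length-3 column.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let a = Column::new("a".into(), [1i32, 2, 3]);
/// let b = Column::new("b".into(), [10i32]);
/// let df = DataFrame::new_with_broadcast(vec![a, b])?;
/// assert_eq!(df.shape(), (3, 2));
/// # Ok::<(), PolarsError>(())
/// ```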
307 pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
308 // The length of the longest non-unit length column determines the
309 // broadcast length. If all columns are unit-length the broadcast length
310 // is one.
311 let broadcast_len = columns
312 .iter()
313 .map(|s| s.len())
314 .filter(|l| *l != 1)
315 .max()
316 .unwrap_or(1);
317 Self::new_with_broadcast_len(columns, broadcast_len)
318 }
319
320 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
321 /// columns to broadcast_len.
322 pub fn new_with_broadcast_len(
323 columns: Vec<Column>,
324 broadcast_len: usize,
325 ) -> PolarsResult<Self> {
326 ensure_names_unique(&columns, |s| s.name().as_str())?;
327 unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
328 }
329
330 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
331 /// columns to match the other columns.
332 ///
333 /// # Safety
334 /// Does not check that the column names are unique (which they must be).
335 pub unsafe fn new_with_broadcast_no_namecheck(
336 mut columns: Vec<Column>,
337 broadcast_len: usize,
338 ) -> PolarsResult<Self> {
339 for col in &mut columns {
340 // Length not equal to the broadcast len, needs broadcast or is an error.
341 let len = col.len();
342 if len != broadcast_len {
343 if len != 1 {
344 let name = col.name().to_owned();
345 let extra_info =
346 if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
347 format!(" (matching column '{}')", c.name())
348 } else {
349 String::new()
350 };
351 polars_bail!(
352 ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
353 );
354 }
355 *col = col.new_from_index(0, broadcast_len);
356 }
357 }
358
359 let length = if columns.is_empty() { 0 } else { broadcast_len };
360
361 Ok(unsafe { DataFrame::new_no_checks(length, columns) })
362 }
363
364 pub fn new_from_index(&self, index: usize, height: usize) -> Self {
365 let cols = self.columns.iter().map(|c| c.new_from_index(index, height));
366 unsafe { Self::new_no_checks(height, cols.collect()) }
367 }
368
369 /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
370 ///
371 /// # Example
372 ///
373 /// ```rust
374 /// use polars_core::prelude::DataFrame;
375 /// static EMPTY: DataFrame = DataFrame::empty();
376 /// ```
377 pub const fn empty() -> Self {
378 Self::empty_with_height(0)
379 }
380
381 /// Creates an empty `DataFrame` with a specific `height`.
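///
/// A minimal sketch: the frame reports rows but no columns.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = DataFrame::empty_with_height(5);
/// assert_eq!(df.shape(), (5, 0));
/// ```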
382 pub const fn empty_with_height(height: usize) -> Self {
383 DataFrame {
384 height,
385 columns: vec![],
386 cached_schema: OnceLock::new(),
387 }
388 }
389
390 /// Create an empty `DataFrame` with empty columns as per the `schema`.
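///
/// A minimal sketch, building the schema by hand:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let schema = Schema::from_iter(vec![Field::new("x".into(), DataType::Int64)]);
/// let df = DataFrame::empty_with_schema(&schema);
/// assert_eq!(df.shape(), (0, 1));
/// assert_eq!(df.dtypes(), &[DataType::Int64]);
/// ```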
391 pub fn empty_with_schema(schema: &Schema) -> Self {
392 let cols = schema
393 .iter()
394 .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
395 .collect();
396 unsafe { DataFrame::new_no_checks(0, cols) }
397 }
398
399 /// Create an empty `DataFrame` with empty columns as per the `schema`.
400 pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
401 let cols = schema
402 .iter_values()
403 .map(|fld| {
404 Column::from(Series::new_empty(
405 fld.name.clone(),
406 &(DataType::from_arrow_field(fld)),
407 ))
408 })
409 .collect();
410 unsafe { DataFrame::new_no_checks(0, cols) }
411 }
412
413 /// Create a new `DataFrame` with the given schema, only containing nulls.
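///
/// A minimal sketch, reusing the schema of an existing frame:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("x" => [1i32, 2, 3])?;
/// let nulls = DataFrame::full_null(df.schema(), 2);
/// assert_eq!(nulls.shape(), (2, 1));
/// # Ok::<(), PolarsError>(())
/// ```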
414 pub fn full_null(schema: &Schema, height: usize) -> Self {
415 let columns = schema
416 .iter_fields()
417 .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
418 .collect();
419 unsafe { DataFrame::new_no_checks(height, columns) }
420 }
421
422 /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
423 ///
424 /// # Example
425 ///
426 /// ```rust
427 /// # use polars_core::prelude::*;
428 /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
429 /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
430 /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
431 ///
432 /// assert_eq!(df.pop(), Some(s2));
433 /// assert_eq!(df.pop(), Some(s1));
434 /// assert_eq!(df.pop(), None);
435 /// assert!(df.is_empty());
436 /// # Ok::<(), PolarsError>(())
437 /// ```
438 pub fn pop(&mut self) -> Option<Column> {
439 self.clear_schema();
440
441 self.columns.pop()
442 }
443
444 /// Add a new column at index 0 that counts the rows.
445 ///
446 /// # Example
447 ///
448 /// ```
449 /// # use polars_core::prelude::*;
450 /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
451 /// assert_eq!(df1.shape(), (4, 1));
452 ///
453 /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
454 /// assert_eq!(df2.shape(), (4, 2));
455 /// println!("{}", df2);
456 ///
457 /// # Ok::<(), PolarsError>(())
458 /// ```
459 ///
460 /// Output:
461 ///
462 /// ```text
463 /// shape: (4, 2)
464 /// +-----+----------+
465 /// | Id | Name |
466 /// | --- | --- |
467 /// | u32 | str |
468 /// +=====+==========+
469 /// | 0 | James |
470 /// +-----+----------+
471 /// | 1 | Mary |
472 /// +-----+----------+
473 /// | 2 | John |
474 /// +-----+----------+
475 /// | 3 | Patricia |
476 /// +-----+----------+
477 /// ```
478 pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
479 let mut columns = Vec::with_capacity(self.columns.len() + 1);
480 let offset = offset.unwrap_or(0);
481
482 let col = Column::new_row_index(name, offset, self.height())?;
483 columns.push(col);
484 columns.extend_from_slice(&self.columns);
485 DataFrame::new(columns)
486 }
487
488 /// Add a row index column in place.
489 ///
490 /// # Safety
491 /// The caller should ensure the DataFrame does not already contain a column with the given name.
492 ///
493 /// # Panics
494 /// Panics if the resulting column would reach or overflow IdxSize::MAX.
495 pub unsafe fn with_row_index_mut(
496 &mut self,
497 name: PlSmallStr,
498 offset: Option<IdxSize>,
499 ) -> &mut Self {
501 debug_assert!(
502 self.columns.iter().all(|c| c.name() != &name),
503 "with_row_index_mut(): column with name {} already exists",
504 &name
505 );
506
507 let offset = offset.unwrap_or(0);
508 let col = Column::new_row_index(name, offset, self.height()).unwrap();
509
510 self.clear_schema();
511 self.columns.insert(0, col);
512 self
513 }
514
515 /// Create a new `DataFrame` without checking the lengths or for duplicate names of the
516 /// `Series`.
517 ///
518 /// Calculates the height from the first column or `0` if no columns are given.
519 ///
520 /// # Safety
521 ///
522 /// It is the caller's responsibility to uphold the contract of all `Series`
523 /// having an equal length and a unique name; if not, this may panic down the line.
524 pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
525 let height = columns.first().map_or(0, Column::len);
526 unsafe { Self::new_no_checks(height, columns) }
527 }
528
529 /// Create a new `DataFrame` without checking the lengths or for duplicate names of the
530 /// `Series`.
531 ///
532 /// It is advised to use [DataFrame::new] in favor of this method.
533 ///
534 /// # Safety
535 ///
536 /// It is the caller's responsibility to uphold the contract of all `Series`
537 /// having an equal length and a unique name; if not, this may panic down the line.
538 pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
539 if cfg!(debug_assertions) {
540 DataFrame::validate_columns_slice(&columns).unwrap();
541 }
542
543 unsafe { Self::_new_no_checks_impl(height, columns) }
544 }
545
546 /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
547 /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
548 /// constructed with this method is generally highly unsafe and should not be long-lived.
549 #[allow(clippy::missing_safety_doc)]
550 pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
551 DataFrame {
552 height,
553 columns,
554 cached_schema: OnceLock::new(),
555 }
556 }
557
558 /// Shrink the capacity of this DataFrame to fit its length.
559 pub fn shrink_to_fit(&mut self) {
560 // Don't parallelize this. Memory overhead
561 for s in &mut self.columns {
562 s.shrink_to_fit();
563 }
564 }
565
566 /// Aggregate all the chunks in the DataFrame to a single chunk.
567 pub fn as_single_chunk(&mut self) -> &mut Self {
568 // Don't parallelize this. Memory overhead
569 for s in &mut self.columns {
570 *s = s.rechunk();
571 }
572 self
573 }
574
575 /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
576 /// This may lead to more peak memory consumption.
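///
/// A minimal sketch: after a `vstack` the column holds multiple chunks; rechunking merges them.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("x" => [1i32, 2])?;
/// df.vstack_mut(&df!("x" => [3i32])?)?;
/// df.as_single_chunk_par();
/// assert_eq!(df.first_col_n_chunks(), 1);
/// # Ok::<(), PolarsError>(())
/// ```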
577 pub fn as_single_chunk_par(&mut self) -> &mut Self {
578 if self.columns.iter().any(|c| c.n_chunks() > 1) {
579 self.columns = self._apply_columns_par(&|s| s.rechunk());
580 }
581 self
582 }
583
584 /// Rechunks all columns to only have a single chunk.
585 pub fn rechunk_mut(&mut self) {
586 // SAFETY: We never adjust the length or names of the columns.
587 let columns = unsafe { self.get_columns_mut() };
588
589 for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
590 *col = col.rechunk();
591 }
592 }
593
594 pub fn _deshare_views_mut(&mut self) {
595 // SAFETY: We never adjust the length or names of the columns.
596 unsafe {
597 let columns = self.get_columns_mut();
598 for col in columns {
599 let Column::Series(s) = col else { continue };
600
601 if let Ok(ca) = s.binary() {
602 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
603 *col = Column::from(gc_ca.into_series());
604 } else if let Ok(ca) = s.str() {
605 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
606 *col = Column::from(gc_ca.into_series());
607 }
608 }
609 }
610 }
611
612 /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
613 pub fn rechunk_to_record_batch(
614 self,
615 compat_level: CompatLevel,
616 ) -> RecordBatchT<Box<dyn Array>> {
617 let height = self.height();
618
619 let (schema, arrays) = self
620 .columns
621 .into_iter()
622 .map(|col| {
623 let mut series = col.take_materialized_series();
624 // Rechunk to one chunk if necessary
625 if series.n_chunks() > 1 {
626 series = series.rechunk();
627 }
628 (
629 series.field().to_arrow(compat_level),
630 series.to_arrow(0, compat_level),
631 )
632 })
633 .collect();
634
635 RecordBatchT::new(height, Arc::new(schema), arrays)
636 }
637
638 /// Returns `true` if the chunks of the columns do not align and re-chunking should be done.
639 pub fn should_rechunk(&self) -> bool {
640 // Fast check. It is also needed for correctness, as code below doesn't check if the number
641 // of chunks is equal.
642 if !self
643 .get_columns()
644 .iter()
645 .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
646 .all_equal()
647 {
648 return true;
649 }
650
651 // From here we check chunk lengths.
652 let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
653 match chunk_lengths.next() {
654 None => false,
655 Some(first_column_chunk_lengths) => {
656 // Fast Path for single Chunk Series
657 if first_column_chunk_lengths.size_hint().0 == 1 {
658 return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
659 }
660 // Always rechunk if we have more chunks than rows.
661 // except when we have an empty df containing a single chunk
662 let height = self.height();
663 let n_chunks = first_column_chunk_lengths.size_hint().0;
664 if n_chunks > height && !(height == 0 && n_chunks == 1) {
665 return true;
666 }
667 // Slow Path for multi Chunk series
668 let v: Vec<_> = first_column_chunk_lengths.collect();
669 for cl in chunk_lengths {
670 if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
671 return true;
672 }
673 }
674 false
675 },
676 }
677 }
678
679 /// Ensure all the chunks in the [`DataFrame`] are aligned.
680 pub fn align_chunks_par(&mut self) -> &mut Self {
681 if self.should_rechunk() {
682 self.as_single_chunk_par()
683 } else {
684 self
685 }
686 }
687
688 pub fn align_chunks(&mut self) -> &mut Self {
689 if self.should_rechunk() {
690 self.as_single_chunk()
691 } else {
692 self
693 }
694 }
695
696 /// Get the [`DataFrame`] schema.
697 ///
698 /// # Example
699 ///
700 /// ```rust
701 /// # use polars_core::prelude::*;
702 /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
703 /// "Diameter (m)" => [8.8e26, f64::INFINITY])?;
704 ///
705 /// let f1: Field = Field::new("Thing".into(), DataType::String);
706 /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
707 /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
708 ///
709 /// assert_eq!(&**df.schema(), &sc);
710 /// # Ok::<(), PolarsError>(())
711 /// ```
712 pub fn schema(&self) -> &SchemaRef {
713 let out = self.cached_schema.get_or_init(|| {
714 Arc::new(
715 self.columns
716 .iter()
717 .map(|x| (x.name().clone(), x.dtype().clone()))
718 .collect(),
719 )
720 });
721
722 debug_assert_eq!(out.len(), self.width());
723
724 out
725 }
726
727 /// Get a reference to the [`DataFrame`] columns.
728 ///
729 /// # Example
730 ///
731 /// ```rust
732 /// # use polars_core::prelude::*;
733 /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
734 /// "Symbol" => ["A", "C", "G", "T"])?;
735 /// let columns: &[Column] = df.get_columns();
736 ///
737 /// assert_eq!(columns[0].name(), "Name");
738 /// assert_eq!(columns[1].name(), "Symbol");
739 /// # Ok::<(), PolarsError>(())
740 /// ```
741 #[inline]
742 pub fn get_columns(&self) -> &[Column] {
743 &self.columns
744 }
745
746 #[inline]
747 /// Get mutable access to the underlying columns.
748 ///
749 /// # Safety
750 ///
751 /// The caller must ensure the length of all [`Series`] remains equal to `height` or
752 /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
753 /// The caller must ensure that the cached schema is cleared if it modifies the schema by
754 /// calling [`DataFrame::clear_schema`].
755 pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
756 &mut self.columns
757 }
758
759 #[inline]
760 /// Remove all the columns in the [`DataFrame`] but keep the `height`.
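///
/// A minimal sketch: the width drops to zero while the height is preserved.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("x" => [1i32, 2, 3])?;
/// df.clear_columns();
/// assert_eq!(df.shape(), (3, 0));
/// # Ok::<(), PolarsError>(())
/// ```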
761 pub fn clear_columns(&mut self) {
762 unsafe { self.get_columns_mut() }.clear();
763 self.clear_schema();
764 }
765
766 #[inline]
767 /// Extend the columns without checking for name collisions or height.
768 ///
769 /// # Safety
770 ///
771 /// The caller needs to ensure that:
772 /// - Column names are unique within the resulting [`DataFrame`].
773 /// - The length of each appended column matches the height of the [`DataFrame`]. For
774 /// [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
775 /// with [`DataFrame::set_height`].
776 pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
777 unsafe { self.get_columns_mut() }.extend(iter);
778 self.clear_schema();
779 }
780
781 /// Take ownership of the underlying columns vec.
782 pub fn take_columns(self) -> Vec<Column> {
783 self.columns
784 }
785
786 /// Iterator over the columns as [`Series`].
787 ///
788 /// # Example
789 ///
790 /// ```rust
791 /// # use polars_core::prelude::*;
792 /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
793 /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
794 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
795 ///
796 /// let mut iterator = df.iter();
797 ///
798 /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
799 /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
800 /// assert_eq!(iterator.next(), None);
801 /// # Ok::<(), PolarsError>(())
802 /// ```
803 pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
804 self.materialized_column_iter()
805 }
806
807 /// # Example
808 ///
809 /// ```rust
810 /// # use polars_core::prelude::*;
811 /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
812 /// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
813 ///
814 /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
815 /// # Ok::<(), PolarsError>(())
816 /// ```
817 pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
818 self.columns.iter().map(|s| s.name()).collect()
819 }
820
821 /// Get the [`Vec<PlSmallStr>`] representing the column names.
822 pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
823 self.columns.iter().map(|s| s.name().clone()).collect()
824 }
825
826 pub fn get_column_names_str(&self) -> Vec<&str> {
827 self.columns.iter().map(|s| s.name().as_str()).collect()
828 }
829
830 /// Set the column names.
831 /// # Example
832 ///
833 /// ```rust
834 /// # use polars_core::prelude::*;
835 /// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;
836 /// df.set_column_names(["Set"])?;
837 ///
838 /// assert_eq!(df.get_column_names(), &["Set"]);
839 /// # Ok::<(), PolarsError>(())
840 /// ```
841 pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
842 where
843 I: IntoIterator<Item = S>,
844 S: Into<PlSmallStr>,
845 {
846 let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
847 self._set_column_names_impl(names.as_slice())
848 }
849
850 fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
851 polars_ensure!(
852 names.len() == self.width(),
853 ShapeMismatch: "{} column names provided for a DataFrame of width {}",
854 names.len(), self.width()
855 );
856 ensure_names_unique(names, |s| s.as_str())?;
857
858 let columns = mem::take(&mut self.columns);
859 self.columns = columns
860 .into_iter()
861 .zip(names)
862 .map(|(s, name)| {
863 let mut s = s;
864 s.rename(name.clone());
865 s
866 })
867 .collect();
868 self.clear_schema();
869 Ok(())
870 }
871
872 /// Get the data types of the columns in the [`DataFrame`].
873 ///
874 /// # Example
875 ///
876 /// ```rust
877 /// # use polars_core::prelude::*;
878 /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
879 /// "Fraction" => [0.965, 0.035])?;
880 ///
881 /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
882 /// # Ok::<(), PolarsError>(())
883 /// ```
884 pub fn dtypes(&self) -> Vec<DataType> {
885 self.columns.iter().map(|s| s.dtype().clone()).collect()
886 }
887
888 pub(crate) fn first_series_column(&self) -> Option<&Series> {
889 self.columns.iter().find_map(|col| col.as_series())
890 }
891
892 /// The number of chunks for the first column.
893 pub fn first_col_n_chunks(&self) -> usize {
894 match self.first_series_column() {
895 None if self.columns.is_empty() => 0,
896 None => 1,
897 Some(s) => s.n_chunks(),
898 }
899 }
900
901 /// The highest number of chunks for any column.
902 pub fn max_n_chunks(&self) -> usize {
903 self.columns
904 .iter()
905 .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
906 .max()
907 .unwrap_or(0)
908 }
909
910 /// Get a reference to the schema fields of the [`DataFrame`].
911 ///
912 /// # Example
913 ///
914 /// ```rust
915 /// # use polars_core::prelude::*;
916 /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
917 /// "Fraction" => [0.708, 0.292])?;
918 ///
919 /// let f1: Field = Field::new("Surface type".into(), DataType::String);
920 /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
921 ///
922 /// assert_eq!(earth.fields(), &[f1, f2]);
923 /// # Ok::<(), PolarsError>(())
924 /// ```
925 pub fn fields(&self) -> Vec<Field> {
926 self.columns
927 .iter()
928 .map(|s| s.field().into_owned())
929 .collect()
930 }
931
932 /// Get (height, width) of the [`DataFrame`].
933 ///
934 /// # Example
935 ///
936 /// ```rust
937 /// # use polars_core::prelude::*;
938 /// let df0: DataFrame = DataFrame::default();
939 /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
940 /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
941 /// "2" => [1, 2, 3, 4, 5])?;
942 ///
943 /// assert_eq!(df0.shape(), (0 ,0));
944 /// assert_eq!(df1.shape(), (5, 1));
945 /// assert_eq!(df2.shape(), (5, 2));
946 /// # Ok::<(), PolarsError>(())
947 /// ```
948 pub fn shape(&self) -> (usize, usize) {
949 (self.height, self.columns.len())
950 }
951
952 /// Get the width of the [`DataFrame`] which is the number of columns.
953 ///
954 /// # Example
955 ///
956 /// ```rust
957 /// # use polars_core::prelude::*;
958 /// let df0: DataFrame = DataFrame::default();
959 /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
960 /// let df2: DataFrame = df!("Series 1" => [0; 0],
961 /// "Series 2" => [0; 0])?;
962 ///
963 /// assert_eq!(df0.width(), 0);
964 /// assert_eq!(df1.width(), 1);
965 /// assert_eq!(df2.width(), 2);
966 /// # Ok::<(), PolarsError>(())
967 /// ```
968 pub fn width(&self) -> usize {
969 self.columns.len()
970 }
971
972 /// Get the height of the [`DataFrame`] which is the number of rows.
973 ///
974 /// # Example
975 ///
976 /// ```rust
977 /// # use polars_core::prelude::*;
978 /// let df0: DataFrame = DataFrame::default();
979 /// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
980 /// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
981 ///
982 /// assert_eq!(df0.height(), 0);
983 /// assert_eq!(df1.height(), 2);
984 /// assert_eq!(df2.height(), 5);
985 /// # Ok::<(), PolarsError>(())
986 /// ```
987 pub fn height(&self) -> usize {
988 self.height
989 }
990
991 /// Returns the size as number of rows * number of columns
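///
/// A minimal sketch:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("x" => [1i32, 2, 3], "y" => [4i32, 5, 6])?;
/// assert_eq!(df.size(), 6);
/// # Ok::<(), PolarsError>(())
/// ```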
992 pub fn size(&self) -> usize {
993 let s = self.shape();
994 s.0 * s.1
995 }
996
997 /// Returns `true` if the [`DataFrame`] contains no rows.
998 ///
999 /// # Example
1000 ///
1001 /// ```rust
1002 /// # use polars_core::prelude::*;
1003 /// let df1: DataFrame = DataFrame::default();
1004 /// assert!(df1.is_empty());
1005 ///
1006 /// let df2: DataFrame = df!("First name" => ["Forever"],
1007 /// "Last name" => ["Alone"])?;
1008 /// assert!(!df2.is_empty());
1009 /// # Ok::<(), PolarsError>(())
1010 /// ```
1011 pub fn is_empty(&self) -> bool {
1012 matches!(self.shape(), (0, _) | (_, 0))
1013 }
1014
1015 /// Set the height (i.e. number of rows) of this [`DataFrame`].
1016 ///
1017 /// # Safety
1018 ///
1019 /// This needs to be equal to the length of all the columns.
1020 pub unsafe fn set_height(&mut self, height: usize) {
1021 self.height = height;
1022 }
1023
1024 /// Add multiple [`Series`] to a [`DataFrame`].
1025 /// The added columns are required to have the same length as the existing columns.
1026 ///
1027 /// # Example
1028 ///
1029 /// ```rust
1030 /// # use polars_core::prelude::*;
1031 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1032 /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1033 /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1034 ///
1035 /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1036 /// assert_eq!(df2.shape(), (3, 3));
1037 /// println!("{}", df2);
1038 /// # Ok::<(), PolarsError>(())
1039 /// ```
1040 ///
1041 /// Output:
1042 ///
1043 /// ```text
1044 /// shape: (3, 3)
1045 /// +---------+--------+----------+
1046 /// | Element | Proton | Electron |
1047 /// | --- | --- | --- |
1048 /// | str | i32 | i32 |
1049 /// +=========+========+==========+
1050 /// | Copper | 29 | 29 |
1051 /// +---------+--------+----------+
1052 /// | Silver | 47 | 47 |
1053 /// +---------+--------+----------+
1054 /// | Gold | 79 | 79 |
1055 /// +---------+--------+----------+
1056 /// ```
1057 pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1058 let mut new_cols = self.columns.clone();
1059 new_cols.extend_from_slice(columns);
1060 DataFrame::new(new_cols)
1061 }
1062
1063 /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1064 ///
1065 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1066 ///
1067 /// # Example
1068 ///
1069 /// ```rust
1070 /// # use polars_core::prelude::*;
1071 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1072 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1073 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1074 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1075 ///
1076 /// let df3: DataFrame = df1.vstack(&df2)?;
1077 ///
1078 /// assert_eq!(df3.shape(), (5, 2));
1079 /// println!("{}", df3);
1080 /// # Ok::<(), PolarsError>(())
1081 /// ```
1082 ///
1083 /// Output:
1084 ///
1085 /// ```text
1086 /// shape: (5, 2)
1087 /// +-----------+-------------------+
1088 /// | Element | Melting Point (K) |
1089 /// | --- | --- |
1090 /// | str | f64 |
1091 /// +===========+===================+
1092 /// | Copper | 1357.77 |
1093 /// +-----------+-------------------+
1094 /// | Silver | 1234.93 |
1095 /// +-----------+-------------------+
1096 /// | Gold | 1337.33 |
1097 /// +-----------+-------------------+
1098 /// | Platinum | 2041.4 |
1099 /// +-----------+-------------------+
1100 /// | Palladium | 1828.05 |
1101 /// +-----------+-------------------+
1102 /// ```
1103 pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1104 let mut df = self.clone();
1105 df.vstack_mut(other)?;
1106 Ok(df)
1107 }
1108
1109 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1110 ///
1111 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1112 ///
1113 /// # Example
1114 ///
1115 /// ```rust
1116 /// # use polars_core::prelude::*;
1117 /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1118 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1119 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1120 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1121 ///
1122 /// df1.vstack_mut(&df2)?;
1123 ///
1124 /// assert_eq!(df1.shape(), (5, 2));
1125 /// println!("{}", df1);
1126 /// # Ok::<(), PolarsError>(())
1127 /// ```
1128 ///
1129 /// Output:
1130 ///
1131 /// ```text
1132 /// shape: (5, 2)
1133 /// +-----------+-------------------+
1134 /// | Element | Melting Point (K) |
1135 /// | --- | --- |
1136 /// | str | f64 |
1137 /// +===========+===================+
1138 /// | Copper | 1357.77 |
1139 /// +-----------+-------------------+
1140 /// | Silver | 1234.93 |
1141 /// +-----------+-------------------+
1142 /// | Gold | 1337.33 |
1143 /// +-----------+-------------------+
1144 /// | Platinum | 2041.4 |
1145 /// +-----------+-------------------+
1146 /// | Palladium | 1828.05 |
1147 /// +-----------+-------------------+
1148 /// ```
1149 pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1150 if self.width() != other.width() {
1151 polars_ensure!(
1152 self.width() == 0,
1153 ShapeMismatch:
1154 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1155 self.width(), other.width(),
1156 );
1157 self.columns.clone_from(&other.columns);
1158 self.height = other.height;
1159 return Ok(self);
1160 }
1161
1162 self.columns
1163 .iter_mut()
1164 .zip(other.columns.iter())
1165 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1166 ensure_can_extend(&*left, right)?;
1167 left.append(right).map_err(|e| {
1168 e.context(format!("failed to vstack column '{}'", right.name()).into())
1169 })?;
1170 Ok(())
1171 })?;
1172 self.height += other.height;
1173 Ok(self)
1174 }
1175
1176 pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1177 if self.width() != other.width() {
1178 polars_ensure!(
1179 self.width() == 0,
1180 ShapeMismatch:
1181 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1182 self.width(), other.width(),
1183 );
1184 self.columns = other.columns;
1185 self.height = other.height;
1186 return Ok(self);
1187 }
1188
1189 self.columns
1190 .iter_mut()
1191 .zip(other.columns.into_iter())
1192 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1193 ensure_can_extend(&*left, &right)?;
1194 let right_name = right.name().clone();
1195 left.append_owned(right).map_err(|e| {
1196 e.context(format!("failed to vstack column '{right_name}'").into())
1197 })?;
1198 Ok(())
1199 })?;
1200 self.height += other.height;
1201 Ok(self)
1202 }
1203
1204 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1205 ///
1206 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1207 ///
1208 /// # Panics
1209 /// Panics if the schemas don't match.
1210 pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1211 self.columns
1212 .iter_mut()
1213 .zip(other.columns.iter())
1214 .for_each(|(left, right)| {
1215 left.append(right)
1216 .map_err(|e| {
1217 e.context(format!("failed to vstack column '{}'", right.name()).into())
1218 })
1219 .expect("should not fail");
1220 });
1221 self.height += other.height;
1222 }
1223
1224 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1225 ///
1226 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1227 ///
1228 /// # Panics
1229 /// Panics if the schemas don't match.
1230 pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1231 self.columns
1232 .iter_mut()
1233 .zip(other.columns)
1234 .for_each(|(left, right)| {
1235 left.append_owned(right).expect("should not fail");
1236 });
1237 self.height += other.height;
1238 }
1239
1240 /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1241 ///
1242 /// Different from [`vstack`](Self::vstack), which adds the chunks from `other` to the chunks of this [`DataFrame`],
1243 /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1244 ///
1245 /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1246 /// and thus will yield faster queries.
1247 ///
1248 /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1249 /// online operations where you add `n` rows and rerun a query.
1250 ///
1251 /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1252 /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
1253 /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
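///
/// # Example
///
/// A minimal sketch: the rows of `other` are appended to the existing buffers.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("x" => [1i32, 2])?;
/// df.extend(&df!("x" => [3i32, 4])?)?;
/// assert_eq!(df.height(), 4);
/// # Ok::<(), PolarsError>(())
/// ```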
1254 pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1255 polars_ensure!(
1256 self.width() == other.width(),
1257 ShapeMismatch:
1258 "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1259 self.width(), other.width(),
1260 );
1261
1262 self.columns
1263 .iter_mut()
1264 .zip(other.columns.iter())
1265 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1266 ensure_can_extend(&*left, right)?;
1267 left.extend(right).map_err(|e| {
1268 e.context(format!("failed to extend column '{}'", right.name()).into())
1269 })?;
1270 Ok(())
1271 })?;
1272 self.height += other.height;
1273 self.clear_schema();
1274 Ok(())
1275 }
1276
1277 /// Remove a column by name and return the column removed.
1278 ///
1279 /// # Example
1280 ///
1281 /// ```rust
1282 /// # use polars_core::prelude::*;
1283 /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1284 /// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1285 ///
1286 /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1287 /// assert!(s1.is_err());
1288 ///
1289 /// let s2: Column = df.drop_in_place("Animal")?;
1290 /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1291 /// # Ok::<(), PolarsError>(())
1292 /// ```
1293 pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1294 let idx = self.check_name_to_idx(name)?;
1295 self.clear_schema();
1296 Ok(self.columns.remove(idx))
1297 }
1298
1299 /// Return a new [`DataFrame`] where all null values are dropped.
1300 ///
1301 /// # Example
1302 ///
1303 /// ```no_run
1304 /// # use polars_core::prelude::*;
1305 /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1306 /// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1307 /// assert_eq!(df1.shape(), (3, 2));
1308 ///
1309 /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1310 /// assert_eq!(df2.shape(), (1, 2));
1311 /// println!("{}", df2);
1312 /// # Ok::<(), PolarsError>(())
1313 /// ```
1314 ///
1315 /// Output:
1316 ///
1317 /// ```text
1318 /// shape: (1, 2)
1319 /// +---------+---------------------+
1320 /// | Country | Tax revenue (% GDP) |
1321 /// | --- | --- |
1322 /// | str | f64 |
1323 /// +=========+=====================+
1324 /// | Malta | 32.7 |
1325 /// +---------+---------------------+
1326 /// ```
1327 pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1328 where
1329 for<'a> &'a S: Into<PlSmallStr>,
1330 {
1331 if let Some(v) = subset {
1332 let v = self.select_columns(v)?;
1333 self._drop_nulls_impl(v.as_slice())
1334 } else {
1335 self._drop_nulls_impl(self.columns.as_slice())
1336 }
1337 }
1338
1339 fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1340 // fast path for no nulls in df
1341 if subset.iter().all(|s| !s.has_nulls()) {
1342 return Ok(self.clone());
1343 }
1344
1345 let mut iter = subset.iter();
1346
1347 let mask = iter
1348 .next()
1349 .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1350 let mut mask = mask.is_not_null();
1351
1352 for c in iter {
1353 mask = mask & c.is_not_null();
1354 }
1355 self.filter(&mask)
1356 }
1357
1358 /// Drop a column by name.
1359 /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1360 /// the current one in place.
1361 ///
1362 /// # Example
1363 ///
1364 /// ```rust
1365 /// # use polars_core::prelude::*;
1366 /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1367 /// let df2: DataFrame = df1.drop("Ray type")?;
1368 ///
1369 /// assert!(df2.is_empty());
1370 /// # Ok::<(), PolarsError>(())
1371 /// ```
1372 pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1373 let idx = self.check_name_to_idx(name)?;
1374 let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1375
1376 self.columns.iter().enumerate().for_each(|(i, s)| {
1377 if i != idx {
1378 new_cols.push(s.clone())
1379 }
1380 });
1381
1382 Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1383 }
1384
1385 /// Drop columns that are in `names`.
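///
/// A minimal sketch (column names are illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("a" => [1i32], "b" => [2i32], "c" => [3i32])?;
/// let out = df.drop_many(["a", "c"]);
/// assert_eq!(out.get_column_names(), &["b"]);
/// # Ok::<(), PolarsError>(())
/// ```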
1386 pub fn drop_many<I, S>(&self, names: I) -> Self
1387 where
1388 I: IntoIterator<Item = S>,
1389 S: Into<PlSmallStr>,
1390 {
1391 let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1392 self.drop_many_amortized(&names)
1393 }
1394
1395 /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1396 pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1397 if names.is_empty() {
1398 return self.clone();
1399 }
1400 let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1401 self.columns.iter().for_each(|s| {
1402 if !names.contains(s.name()) {
1403 new_cols.push(s.clone())
1404 }
1405 });
1406
1407 unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1408 }
1409
1410 /// Insert a new column at a given index without checking for duplicates.
1411 /// This can leave the [`DataFrame`] in an invalid state.
1412 fn insert_column_no_name_check(
1413 &mut self,
1414 index: usize,
1415 column: Column,
1416 ) -> PolarsResult<&mut Self> {
1417 polars_ensure!(
1418 self.width() == 0 || column.len() == self.height(),
1419 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1420 column.len(), self.height(),
1421 );
1422
1423 if self.width() == 0 {
1424 self.height = column.len();
1425 }
1426
1427 self.columns.insert(index, column);
1428 self.clear_schema();
1429 Ok(self)
1430 }
1431
1432 /// Insert a new column at a given index.
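///
/// A minimal sketch (column names are illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1i32, 2], "c" => [5i32, 6])?;
/// df.insert_column(1, Column::new("b".into(), [3i32, 4]))?;
/// assert_eq!(df.get_column_names(), &["a", "b", "c"]);
/// # Ok::<(), PolarsError>(())
/// ```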
1433 pub fn insert_column<S: IntoColumn>(
1434 &mut self,
1435 index: usize,
1436 column: S,
1437 ) -> PolarsResult<&mut Self> {
1438 let column = column.into_column();
1439 self.check_already_present(column.name().as_str())?;
1440 self.insert_column_no_name_check(index, column)
1441 }
1442
1443 fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1444 if let Some(idx) = self.get_column_index(column.name().as_str()) {
1445 self.replace_column(idx, column)?;
1446 } else {
1447 if self.width() == 0 {
1448 self.height = column.len();
1449 }
1450
1451 self.columns.push(column);
1452 self.clear_schema();
1453 }
1454 Ok(())
1455 }
1456
1457 /// Add a new column to this [`DataFrame`] or replace an existing one.
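///
/// A minimal sketch; a length-1 column would instead be broadcast to the frame height.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1i32, 2, 3])?;
/// df.with_column(Column::new("b".into(), [4i32, 5, 6]))?;
/// assert_eq!(df.width(), 2);
/// # Ok::<(), PolarsError>(())
/// ```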
1458 pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1459 fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1460 let height = df.height();
1461 if column.len() == 1 && height > 1 {
1462 column = column.new_from_index(0, height);
1463 }
1464
1465 if column.len() == height || df.get_columns().is_empty() {
1466 df.add_column_by_search(column)?;
1467 Ok(df)
1468 }
1469 // special case for literals
1470 else if height == 0 && column.len() == 1 {
1471 let s = column.clear();
1472 df.add_column_by_search(s)?;
1473 Ok(df)
1474 } else {
1475 polars_bail!(
1476 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1477 column.len(), height,
1478 );
1479 }
1480 }
1481 let column = column.into_column();
1482 inner(self, column)
1483 }
1484
1485 /// Adds a column to the [`DataFrame`] without doing any checks
1486 /// on length or duplicates.
1487 ///
1488 /// # Safety
1489 /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1490 pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1491 debug_assert!(self.width() == 0 || self.height() == column.len());
1492 debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1493
1494 // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1495 // properly for `width` == 0.
1496 if self.width() == 0 {
1497 unsafe { self.set_height(column.len()) };
1498 }
1499 unsafe { self.get_columns_mut() }.push(column);
1500 self.clear_schema();
1501
1502 self
1503 }
1504
1505 // Note: Schema can be either the input or the output schema.
1506 fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1507 let name = c.name();
1508 if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1509 if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1510 // Given schema is output_schema and we can push.
1511 if idx == self.columns.len() {
1512 if self.width() == 0 {
1513 self.height = c.len();
1514 }
1515
1516 self.columns.push(c);
1517 self.clear_schema();
1518 }
1519 // Schema is incorrect; fall back to search.
1520 else {
1521 debug_assert!(false);
1522 self.add_column_by_search(c)?;
1523 }
1524 } else {
1525 self.replace_column(idx, c)?;
1526 }
1527 } else {
1528 if self.width() == 0 {
1529 self.height = c.len();
1530 }
1531
1532 self.columns.push(c);
1533 self.clear_schema();
1534 }
1535
1536 Ok(())
1537 }
1538
1539 // Note: Schema can be either the input or the output schema.
1540 pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1541 for (i, s) in series.into_iter().enumerate() {
1542 // we need to branch here
1543 // because users can add multiple columns with the same name
1544 if i == 0 || schema.get(s.name().as_str()).is_some() {
1545 self.with_column_and_schema(s.into_column(), schema)?;
1546 } else {
1547 self.with_column(s.clone().into_column())?;
1548 }
1549 }
1550 Ok(())
1551 }
1552
1553 pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1554 for (i, s) in columns.into_iter().enumerate() {
1555 // we need to branch here
1556 // because users can add multiple columns with the same name
1557 if i == 0 || schema.get(s.name().as_str()).is_some() {
1558 self.with_column_and_schema(s, schema)?;
1559 } else {
1560 self.with_column(s.clone())?;
1561 }
1562 }
1563
1564 Ok(())
1565 }
1566
1567 /// Add a new column to this [`DataFrame`] or replace an existing one.
1568 /// Uses an existing schema to amortize lookups.
1569 /// If the schema is incorrect, we will fall back to linear search.
1570 ///
1571 /// Note: Schema can be either the input or the output schema.
1572 pub fn with_column_and_schema<C: IntoColumn>(
1573 &mut self,
1574 column: C,
1575 schema: &Schema,
1576 ) -> PolarsResult<&mut Self> {
1577 let mut column = column.into_column();
1578
1579 let height = self.height();
1580 if column.len() == 1 && height > 1 {
1581 column = column.new_from_index(0, height);
1582 }
1583
1584 if column.len() == height || self.columns.is_empty() {
1585 self.add_column_by_schema(column, schema)?;
1586 Ok(self)
1587 }
1588 // special case for literals
1589 else if height == 0 && column.len() == 1 {
1590 let s = column.clear();
1591 self.add_column_by_schema(s, schema)?;
1592 Ok(self)
1593 } else {
1594 polars_bail!(
1595 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1596 column.len(), height,
1597 );
1598 }
1599 }
1600
1601 /// Get a row in the [`DataFrame`]. Beware this is slow.
1602 ///
1603 /// # Example
1604 ///
1605 /// ```
1606 /// # use polars_core::prelude::*;
1607 /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1608 /// df.get(idx)
1609 /// }
1610 /// ```
1611 pub fn get(&self, idx: usize) -> Option<Vec<AnyValue>> {
1612 match self.columns.first() {
1613 Some(s) => {
1614 if s.len() <= idx {
1615 return None;
1616 }
1617 },
1618 None => return None,
1619 }
1620 // SAFETY: we just checked bounds
1621 unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1622 }
1623
1624 /// Select a [`Series`] by index.
1625 ///
1626 /// # Example
1627 ///
1628 /// ```rust
1629 /// # use polars_core::prelude::*;
1630 /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1631 /// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1632 ///
1633 /// let s1: Option<&Column> = df.select_at_idx(0);
1634 /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1635 ///
1636 /// assert_eq!(s1, Some(&s2));
1637 /// # Ok::<(), PolarsError>(())
1638 /// ```
1639 pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1640 self.columns.get(idx)
1641 }
1642
1643 /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1644 ///
1645 /// # Examples
1646 ///
1647 /// ```rust
1648 /// # use polars_core::prelude::*;
1649 /// let df = df! {
1650 /// "0" => [0, 0, 0],
1651 /// "1" => [1, 1, 1],
1652 /// "2" => [2, 2, 2]
1653 /// }?;
1654 ///
1655 /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1656 /// assert!(df.equals(&df.select_by_range(..)?));
1657 /// # Ok::<(), PolarsError>(())
1658 /// ```
1659 pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1660 where
1661 R: ops::RangeBounds<usize>,
1662 {
1663 // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1664 // because it is a nightly feature. We should switch to it once it is stabilized.
1665 fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1666 where
1667 R: ops::RangeBounds<usize>,
1668 {
1669 let len = bounds.end;
1670
1671 let start: ops::Bound<&usize> = range.start_bound();
1672 let start = match start {
1673 ops::Bound::Included(&start) => start,
1674 ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1675 panic!("attempted to index slice from after maximum usize");
1676 }),
1677 ops::Bound::Unbounded => 0,
1678 };
1679
1680 let end: ops::Bound<&usize> = range.end_bound();
1681 let end = match end {
1682 ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1683 panic!("attempted to index slice up to maximum usize");
1684 }),
1685 ops::Bound::Excluded(&end) => end,
1686 ops::Bound::Unbounded => len,
1687 };
1688
1689 if start > end {
1690 panic!("slice index starts at {start} but ends at {end}");
1691 }
1692 if end > len {
1693 panic!("range end index {end} out of range for slice of length {len}",);
1694 }
1695
1696 ops::Range { start, end }
1697 }
1698
1699 let colnames = self.get_column_names_owned();
1700 let range = get_range(range, ..colnames.len());
1701
1702 self._select_impl(&colnames[range])
1703 }
1704
1705 /// Get column index of a [`Series`] by name.
1706 /// # Example
1707 ///
1708 /// ```rust
1709 /// # use polars_core::prelude::*;
1710 /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1711 /// "Health" => [100, 200, 500],
1712 /// "Mana" => [250, 100, 0],
1713 /// "Strength" => [30, 150, 300])?;
1714 ///
1715 /// assert_eq!(df.get_column_index("Name"), Some(0));
1716 /// assert_eq!(df.get_column_index("Health"), Some(1));
1717 /// assert_eq!(df.get_column_index("Mana"), Some(2));
1718 /// assert_eq!(df.get_column_index("Strength"), Some(3));
1719 /// assert_eq!(df.get_column_index("Haste"), None);
1720 /// # Ok::<(), PolarsError>(())
1721 /// ```
1722 pub fn get_column_index(&self, name: &str) -> Option<usize> {
1723 let schema = self.schema();
1724 if let Some(idx) = schema.index_of(name) {
1725 if self
1726 .get_columns()
1727 .get(idx)
1728 .is_some_and(|c| c.name() == name)
1729 {
1730 return Some(idx);
1731 }
1732 }
1733
1734 self.columns.iter().position(|s| s.name().as_str() == name)
1735 }
1736
1737 /// Get column index of a [`Series`] by name.
1738 pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1739 self.get_column_index(name)
1740 .ok_or_else(|| polars_err!(col_not_found = name))
1741 }
1742
1743 /// Select a single column by name.
1744 ///
1745 /// # Example
1746 ///
1747 /// ```rust
1748 /// # use polars_core::prelude::*;
1749 /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1750 /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1751 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1752 ///
1753 /// assert_eq!(df.column("Password")?, &s1);
1754 /// # Ok::<(), PolarsError>(())
1755 /// ```
1756 pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1757 let idx = self.try_get_column_index(name)?;
1758 Ok(self.select_at_idx(idx).unwrap())
1759 }
1760
1761 /// Select multiple columns by name.
1762 ///
1763 /// # Example
1764 ///
1765 /// ```rust
1766 /// # use polars_core::prelude::*;
1767 /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1768 /// "Max weight (kg)" => [16.0, 35.89])?;
1769 /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1770 ///
1771 /// assert_eq!(&df[0], sv[0]);
1772 /// assert_eq!(&df[1], sv[1]);
1773 /// # Ok::<(), PolarsError>(())
1774 /// ```
1775 pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1776 where
1777 I: IntoIterator<Item = S>,
1778 S: AsRef<str>,
1779 {
1780 names
1781 .into_iter()
1782 .map(|name| self.column(name.as_ref()))
1783 .collect()
1784 }
1785
1786 /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1787 ///
1788 /// # Examples
1789 ///
1790 /// ```
1791 /// # use polars_core::prelude::*;
1792 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1793 /// df.select(["foo", "bar"])
1794 /// }
1795 /// ```
1796 pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1797 where
1798 I: IntoIterator<Item = S>,
1799 S: Into<PlSmallStr>,
1800 {
1801 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1802 self._select_impl(cols.as_slice())
1803 }
1804
1805 pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1806 ensure_names_unique(cols, |s| s.as_str())?;
1807 self._select_impl_unchecked(cols)
1808 }
1809
1810 pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1811 let selected = self.select_columns_impl(cols)?;
1812 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1813 }
1814
1815 /// Select with a known schema. The schema names must match the column names of this DataFrame.
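///
/// # Example
///
/// A minimal sketch; the column names `"foo"` and `"bar"` are illustrative and the
/// passed `schema` is assumed to describe `df` itself:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame, schema: &SchemaRef) -> PolarsResult<DataFrame> {
///     df.select_with_schema(["foo", "bar"], schema)
/// }
/// ```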
1816 pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1817 where
1818 I: IntoIterator<Item = S>,
1819 S: Into<PlSmallStr>,
1820 {
1821 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1822 self._select_with_schema_impl(&cols, schema, true)
1823 }
1824
1825 /// Select with a known schema without checking for duplicates in `selection`.
1826 /// The schema names must match the column names of this DataFrame.
1827 pub fn select_with_schema_unchecked<I, S>(
1828 &self,
1829 selection: I,
1830 schema: &Schema,
1831 ) -> PolarsResult<Self>
1832 where
1833 I: IntoIterator<Item = S>,
1834 S: Into<PlSmallStr>,
1835 {
1836 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1837 self._select_with_schema_impl(&cols, schema, false)
1838 }
1839
/// The schema names must match the column names of this DataFrame.
1841 pub fn _select_with_schema_impl(
1842 &self,
1843 cols: &[PlSmallStr],
1844 schema: &Schema,
1845 check_duplicates: bool,
1846 ) -> PolarsResult<Self> {
1847 if check_duplicates {
1848 ensure_names_unique(cols, |s| s.as_str())?;
1849 }
1850
1851 let selected = self.select_columns_impl_with_schema(cols, schema)?;
1852 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1853 }
1854
/// A non-generic implementation to reduce compiler bloat.
1856 fn select_columns_impl_with_schema(
1857 &self,
1858 cols: &[PlSmallStr],
1859 schema: &Schema,
1860 ) -> PolarsResult<Vec<Column>> {
1861 if cfg!(debug_assertions) {
1862 ensure_matching_schema_names(schema, self.schema())?;
1863 }
1864
1865 cols.iter()
1866 .map(|name| {
1867 let index = schema.try_get_full(name.as_str())?.0;
1868 Ok(self.columns[index].clone())
1869 })
1870 .collect()
1871 }
1872
1873 pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1874 where
1875 I: IntoIterator<Item = S>,
1876 S: Into<PlSmallStr>,
1877 {
1878 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1879 self.select_physical_impl(&cols)
1880 }
1881
1882 fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1883 ensure_names_unique(cols, |s| s.as_str())?;
1884 let selected = self.select_columns_physical_impl(cols)?;
1885 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1886 }
1887
1888 /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1889 ///
1890 /// # Example
1891 ///
1892 /// ```rust
1893 /// # use polars_core::prelude::*;
1894 /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1895 /// "Carbon" => [1, 2, 3],
1896 /// "Hydrogen" => [4, 6, 8])?;
1897 /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1898 ///
1899 /// assert_eq!(df["Carbon"], sv[0]);
1900 /// assert_eq!(df["Hydrogen"], sv[1]);
1901 /// # Ok::<(), PolarsError>(())
1902 /// ```
1903 pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1904 let cols = selection.into_vec();
1905 self.select_columns_impl(&cols)
1906 }
1907
1908 fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1909 self.columns
1910 .iter()
1911 .enumerate()
1912 .map(|(i, s)| (s.name().as_str(), i))
1913 .collect()
1914 }
1915
/// A non-generic implementation to reduce compiler bloat.
1917 fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1918 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1919 let name_to_idx = self._names_to_idx_map();
1920 cols.iter()
1921 .map(|name| {
1922 let idx = *name_to_idx
1923 .get(name.as_str())
1924 .ok_or_else(|| polars_err!(col_not_found = name))?;
1925 Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1926 })
1927 .collect::<PolarsResult<Vec<_>>>()?
1928 } else {
1929 cols.iter()
1930 .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1931 .collect::<PolarsResult<Vec<_>>>()?
1932 };
1933
1934 Ok(selected)
1935 }
1936
/// A non-generic implementation to reduce compiler bloat.
1938 fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1939 let selected = if cols.len() > 1 && self.columns.len() > 10 {
// We hash because there are users that have millions of columns.
1941 // # https://github.com/pola-rs/polars/issues/1023
1942 let name_to_idx = self._names_to_idx_map();
1943
1944 cols.iter()
1945 .map(|name| {
1946 let idx = *name_to_idx
1947 .get(name.as_str())
1948 .ok_or_else(|| polars_err!(col_not_found = name))?;
1949 Ok(self.select_at_idx(idx).unwrap().clone())
1950 })
1951 .collect::<PolarsResult<Vec<_>>>()?
1952 } else {
1953 cols.iter()
1954 .map(|c| self.column(c.as_str()).cloned())
1955 .collect::<PolarsResult<Vec<_>>>()?
1956 };
1957
1958 Ok(selected)
1959 }
1960
1961 fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
// If there is a filtered column, just check how many rows are left.
1963 if let Some(fst) = filtered.first() {
1964 return fst.len();
1965 }
1966
// Otherwise, count the number of mask values that are true and derive the height from that.
1968 let num_trues = mask.num_trues();
1969 if mask.len() == self.height() {
1970 num_trues
1971 } else {
1972 // This is for broadcasting masks
1973 debug_assert!(num_trues == 0 || num_trues == 1);
1974 self.height() * num_trues
1975 }
1976 }
1977
1978 /// Take the [`DataFrame`] rows by a boolean mask.
1979 ///
1980 /// # Example
1981 ///
1982 /// ```
1983 /// # use polars_core::prelude::*;
1984 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1985 /// let mask = df.column("sepal_width")?.is_not_null();
1986 /// df.filter(&mask)
1987 /// }
1988 /// ```
1989 pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1990 let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
1991 let height = self.filter_height(&new_col, mask);
1992
1993 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1994 }
1995
1996 /// Same as `filter` but does not parallelize.
1997 pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1998 let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
1999 let height = self.filter_height(&new_col, mask);
2000
2001 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2002 }
2003
2004 /// Take [`DataFrame`] rows by index values.
2005 ///
2006 /// # Example
2007 ///
2008 /// ```
2009 /// # use polars_core::prelude::*;
2010 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2011 /// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2012 /// df.take(&idx)
2013 /// }
2014 /// ```
2015 pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2016 let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2017
2018 Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2019 }
2020
2021 /// # Safety
2022 /// The indices must be in-bounds.
2023 pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2024 self.take_unchecked_impl(idx, true)
2025 }
2026
2027 /// # Safety
2028 /// The indices must be in-bounds.
2029 pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
2030 let cols = if allow_threads {
2031 POOL.install(|| self._apply_columns_par(&|c| c.take_unchecked(idx)))
2032 } else {
2033 self._apply_columns(&|s| s.take_unchecked(idx))
2034 };
2035 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2036 }
2037
2038 /// # Safety
2039 /// The indices must be in-bounds.
2040 pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2041 self.take_slice_unchecked_impl(idx, true)
2042 }
2043
2044 /// # Safety
2045 /// The indices must be in-bounds.
2046 pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2047 let cols = if allow_threads {
2048 POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx)))
2049 } else {
2050 self._apply_columns(&|s| s.take_slice_unchecked(idx))
2051 };
2052 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2053 }
2054
2055 /// Rename a column in the [`DataFrame`].
2056 ///
2057 /// # Example
2058 ///
2059 /// ```
2060 /// # use polars_core::prelude::*;
2061 /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2062 /// let original_name = "foo";
2063 /// let new_name = "bar";
2064 /// df.rename(original_name, new_name.into())
2065 /// }
2066 /// ```
2067 pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2068 if column == name.as_str() {
2069 return Ok(self);
2070 }
2071 polars_ensure!(
2072 !self.schema().contains(&name),
2073 Duplicate: "column rename attempted with already existing name \"{name}\""
2074 );
2075
2076 self.get_column_index(column)
2077 .and_then(|idx| self.columns.get_mut(idx))
2078 .ok_or_else(|| polars_err!(col_not_found = column))
2079 .map(|c| c.rename(name))?;
2080 Ok(self)
2081 }
2082
2083 /// Sort [`DataFrame`] in place.
2084 ///
2085 /// See [`DataFrame::sort`] for more instruction.
2086 pub fn sort_in_place(
2087 &mut self,
2088 by: impl IntoVec<PlSmallStr>,
2089 sort_options: SortMultipleOptions,
2090 ) -> PolarsResult<&mut Self> {
2091 let by_column = self.select_columns(by)?;
2092 self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2093 Ok(self)
2094 }
2095
2096 #[doc(hidden)]
2097 /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
2098 pub fn sort_impl(
2099 &self,
2100 by_column: Vec<Column>,
2101 mut sort_options: SortMultipleOptions,
2102 slice: Option<(i64, usize)>,
2103 ) -> PolarsResult<Self> {
2104 if by_column.is_empty() {
2105 // If no columns selected, any order (including original order) is correct.
2106 return if let Some((offset, len)) = slice {
2107 Ok(self.slice(offset, len))
2108 } else {
2109 Ok(self.clone())
2110 };
2111 }
2112
// Note that the by_column argument may also contain evaluated expressions from
// polars-lazy that are not even present in this dataframe. Therefore, when we try
// to set the first column as sorted, we ignore the error, as those expressions are
// not present (they are renamed to _POLARS_SORT_COLUMN_i).
2117 let first_descending = sort_options.descending[0];
2118 let first_by_column = by_column[0].name().to_string();
2119
2120 let set_sorted = |df: &mut DataFrame| {
2121 // Mark the first sort column as sorted; if the column does not exist it
2122 // is ok, because we sorted by an expression not present in the dataframe
2123 let _ = df.apply(&first_by_column, |s| {
2124 let mut s = s.clone();
2125 if first_descending {
2126 s.set_sorted_flag(IsSorted::Descending)
2127 } else {
2128 s.set_sorted_flag(IsSorted::Ascending)
2129 }
2130 s
2131 });
2132 };
2133 if self.is_empty() {
2134 let mut out = self.clone();
2135 set_sorted(&mut out);
2136 return Ok(out);
2137 }
2138
2139 if let Some((0, k)) = slice {
2140 if k < self.len() {
2141 return self.bottom_k_impl(k, by_column, sort_options);
2142 }
2143 }
// Check if the required column is already sorted; if so we can exit early.
// We only do this when there is a single column to sort by; for multiple
// columns it would be complicated to do so.
2147 #[cfg(feature = "dtype-categorical")]
2148 let is_not_categorical_enum =
2149 !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2150 || matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2151
2152 #[cfg(not(feature = "dtype-categorical"))]
2153 #[allow(non_upper_case_globals)]
2154 const is_not_categorical_enum: bool = true;
2155
2156 if by_column.len() == 1 && is_not_categorical_enum {
2157 let required_sorting = if sort_options.descending[0] {
2158 IsSorted::Descending
2159 } else {
2160 IsSorted::Ascending
2161 };
// If the null count is 0 then nulls_last doesn't matter.
// It is safe to get the value at the last position since the dataframe is not empty (handled above).
2164 let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2165 && ((by_column[0].null_count() == 0)
2166 || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2167 == sort_options.nulls_last[0]);
2168
2169 if no_sorting_required {
2170 return if let Some((offset, len)) = slice {
2171 Ok(self.slice(offset, len))
2172 } else {
2173 Ok(self.clone())
2174 };
2175 }
2176 }
2177
2178 let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
2179
2180 // a lot of indirection in both sorting and take
2181 let mut df = self.clone();
2182 let df = df.as_single_chunk_par();
2183 let mut take = match (by_column.len(), has_nested) {
2184 (1, false) => {
2185 let s = &by_column[0];
2186 let options = SortOptions {
2187 descending: sort_options.descending[0],
2188 nulls_last: sort_options.nulls_last[0],
2189 multithreaded: sort_options.multithreaded,
2190 maintain_order: sort_options.maintain_order,
2191 limit: sort_options.limit,
2192 };
2193 // fast path for a frame with a single series
2194 // no need to compute the sort indices and then take by these indices
2195 // simply sort and return as frame
2196 if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2197 let mut out = s.sort_with(options)?;
2198 if let Some((offset, len)) = slice {
2199 out = out.slice(offset, len);
2200 }
2201 return Ok(out.into_frame());
2202 }
2203 s.arg_sort(options)
2204 },
2205 _ => {
2206 if sort_options.nulls_last.iter().all(|&x| x)
2207 || has_nested
2208 || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
2209 {
2210 argsort_multiple_row_fmt(
2211 &by_column,
2212 sort_options.descending,
2213 sort_options.nulls_last,
2214 sort_options.multithreaded,
2215 )?
2216 } else {
2217 let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
2218 first
2219 .as_materialized_series()
2220 .arg_sort_multiple(&other, &sort_options)?
2221 }
2222 },
2223 };
2224
2225 if let Some((offset, len)) = slice {
2226 take = take.slice(offset, len);
2227 }
2228
2229 // SAFETY:
2230 // the created indices are in bounds
2231 let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
2232 set_sorted(&mut df);
2233 Ok(df)
2234 }
2235
2236 /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2237 ///
2238 /// This dataframe does not necessarily have a specified schema and may be changed at any
2239 /// point. It is primarily used for debugging.
2240 pub fn _to_metadata(&self) -> DataFrame {
2241 let num_columns = self.columns.len();
2242
2243 let mut column_names =
2244 StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2245 let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2246 let mut sorted_asc_ca =
2247 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2248 let mut sorted_dsc_ca =
2249 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2250 let mut fast_explode_list_ca =
2251 BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2252 let mut materialized_at_ca =
2253 StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2254
2255 for col in &self.columns {
2256 let flags = col.get_flags();
2257
2258 let (repr, materialized_at) = match col {
2259 Column::Series(s) => ("series", s.materialized_at()),
2260 Column::Partitioned(_) => ("partitioned", None),
2261 Column::Scalar(_) => ("scalar", None),
2262 };
2263 let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2264 let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2265 let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2266
2267 column_names.append_value(col.name().clone());
2268 repr_ca.append_value(repr);
2269 sorted_asc_ca.append_value(sorted_asc);
2270 sorted_dsc_ca.append_value(sorted_dsc);
2271 fast_explode_list_ca.append_value(fast_explode_list);
2272 materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2273 }
2274
2275 unsafe {
2276 DataFrame::new_no_checks(
2277 self.width(),
2278 vec![
2279 column_names.finish().into_column(),
2280 repr_ca.finish().into_column(),
2281 sorted_asc_ca.finish().into_column(),
2282 sorted_dsc_ca.finish().into_column(),
2283 fast_explode_list_ca.finish().into_column(),
2284 materialized_at_ca.finish().into_column(),
2285 ],
2286 )
2287 }
2288 }
2289
2290 /// Return a sorted clone of this [`DataFrame`].
2291 ///
/// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
2293 /// # Example
2294 ///
2295 /// Sort by a single column with default options:
2296 /// ```
2297 /// # use polars_core::prelude::*;
2298 /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2299 /// df.sort(["sepal_width"], Default::default())
2300 /// }
2301 /// ```
2302 /// Sort by a single column with specific order:
2303 /// ```
2304 /// # use polars_core::prelude::*;
2305 /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2306 /// df.sort(
2307 /// ["sepal_width"],
2308 /// SortMultipleOptions::new()
2309 /// .with_order_descending(descending)
2310 /// )
2311 /// }
2312 /// ```
2313 /// Sort by multiple columns with specifying order for each column:
2314 /// ```
2315 /// # use polars_core::prelude::*;
2316 /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2317 /// df.sort(
2318 /// ["sepal_width", "sepal_length"],
2319 /// SortMultipleOptions::new()
2320 /// .with_order_descending_multi([false, true])
2321 /// )
2322 /// }
2323 /// ```
2324 /// See [`SortMultipleOptions`] for more options.
2325 ///
2326 /// Also see [`DataFrame::sort_in_place`].
2327 pub fn sort(
2328 &self,
2329 by: impl IntoVec<PlSmallStr>,
2330 sort_options: SortMultipleOptions,
2331 ) -> PolarsResult<Self> {
2332 let mut df = self.clone();
2333 df.sort_in_place(by, sort_options)?;
2334 Ok(df)
2335 }
2336
2337 /// Replace a column with a [`Series`].
2338 ///
2339 /// # Example
2340 ///
2341 /// ```rust
2342 /// # use polars_core::prelude::*;
2343 /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2344 /// "Area (km²)" => [9_833_520, 9_596_961])?;
2345 /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2346 ///
2347 /// assert!(df.replace("Nation", s.clone()).is_err());
2348 /// assert!(df.replace("Country", s).is_ok());
2349 /// # Ok::<(), PolarsError>(())
2350 /// ```
2351 pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2352 self.apply(column, |_| new_col.into_series())
2353 }
2354
2355 /// Replace or update a column. The difference between this method and [DataFrame::with_column]
/// is that now the value of `column: PlSmallStr` determines the name of the column and not the name
2357 /// of the `Series` passed to this method.
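///
/// # Example
///
/// A small sketch; the column names and values are illustrative:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2, 3])?;
///
/// // The new data ends up under the name passed as `column` ("b" here),
/// // not under the name of the Series itself ("ignored").
/// df.replace_or_add("b".into(), Series::new("ignored".into(), [4, 5, 6]))?;
/// assert_eq!(df.get_column_names(), &["a", "b"]);
/// # Ok::<(), PolarsError>(())
/// ```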
2358 pub fn replace_or_add<S: IntoSeries>(
2359 &mut self,
2360 column: PlSmallStr,
2361 new_col: S,
2362 ) -> PolarsResult<&mut Self> {
2363 let mut new_col = new_col.into_series();
2364 new_col.rename(column);
2365 self.with_column(new_col)
2366 }
2367
2368 /// Replace column at index `idx` with a [`Series`].
2369 ///
2370 /// # Example
2371 ///
/// ```ignore
2373 /// # use polars_core::prelude::*;
2374 /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2375 /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2376 /// let mut df = DataFrame::new(vec![s0, s1])?;
2377 ///
2378 /// // Add 32 to get lowercase ascii values
2379 /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2380 /// # Ok::<(), PolarsError>(())
2381 /// ```
2382 pub fn replace_column<C: IntoColumn>(
2383 &mut self,
2384 index: usize,
2385 new_column: C,
2386 ) -> PolarsResult<&mut Self> {
2387 polars_ensure!(
2388 index < self.width(),
2389 ShapeMismatch:
2390 "unable to replace at index {}, the DataFrame has only {} columns",
2391 index, self.width(),
2392 );
2393 let mut new_column = new_column.into_column();
2394 polars_ensure!(
2395 new_column.len() == self.height(),
2396 ShapeMismatch:
2397 "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2398 new_column.len(), self.height(),
2399 );
2400 let old_col = &mut self.columns[index];
2401 mem::swap(old_col, &mut new_column);
2402 self.clear_schema();
2403 Ok(self)
2404 }
2405
/// Apply a closure to a column. This is the recommended way to do in-place modification.
2407 ///
2408 /// # Example
2409 ///
2410 /// ```rust
2411 /// # use polars_core::prelude::*;
2412 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2413 /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2414 /// let mut df = DataFrame::new(vec![s0, s1])?;
2415 ///
2416 /// fn str_to_len(str_val: &Column) -> Column {
2417 /// str_val.str()
2418 /// .unwrap()
2419 /// .into_iter()
2420 /// .map(|opt_name: Option<&str>| {
2421 /// opt_name.map(|name: &str| name.len() as u32)
2422 /// })
2423 /// .collect::<UInt32Chunked>()
2424 /// .into_column()
2425 /// }
2426 ///
2427 /// // Replace the names column by the length of the names.
2428 /// df.apply("names", str_to_len);
2429 /// # Ok::<(), PolarsError>(())
2430 /// ```
2431 /// Results in:
2432 ///
2433 /// ```text
2434 /// +--------+-------+
/// | foo | names |
/// | --- | --- |
/// | str | u32 |
2438 /// +========+=======+
2439 /// | "ham" | 4 |
2440 /// +--------+-------+
2441 /// | "spam" | 6 |
2442 /// +--------+-------+
2443 /// | "egg" | 3 |
2444 /// +--------+-------+
2445 /// ```
2446 pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2447 where
2448 F: FnOnce(&Column) -> C,
2449 C: IntoColumn,
2450 {
2451 let idx = self.check_name_to_idx(name)?;
2452 self.apply_at_idx(idx, f)
2453 }
2454
/// Apply a closure to a column at index `idx`. This is the recommended way to do in-place
2456 /// modification.
2457 ///
2458 /// # Example
2459 ///
2460 /// ```rust
2461 /// # use polars_core::prelude::*;
2462 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2463 /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2464 /// let mut df = DataFrame::new(vec![s0, s1])?;
2465 ///
2466 /// // Add 32 to get lowercase ascii values
2467 /// df.apply_at_idx(1, |s| s + 32);
2468 /// # Ok::<(), PolarsError>(())
2469 /// ```
2470 /// Results in:
2471 ///
2472 /// ```text
2473 /// +--------+-------+
2474 /// | foo | ascii |
2475 /// | --- | --- |
2476 /// | str | i32 |
2477 /// +========+=======+
2478 /// | "ham" | 102 |
2479 /// +--------+-------+
2480 /// | "spam" | 111 |
2481 /// +--------+-------+
2482 /// | "egg" | 111 |
2483 /// +--------+-------+
2484 /// ```
2485 pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2486 where
2487 F: FnOnce(&Column) -> C,
2488 C: IntoColumn,
2489 {
2490 let df_height = self.height();
2491 let width = self.width();
2492 let col = self.columns.get_mut(idx).ok_or_else(|| {
2493 polars_err!(
2494 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2495 idx, width
2496 )
2497 })?;
2498 let name = col.name().clone();
2499 let new_col = f(col).into_column();
2500 match new_col.len() {
2501 1 => {
2502 let new_col = new_col.new_from_index(0, df_height);
2503 let _ = mem::replace(col, new_col);
2504 },
2505 len if (len == df_height) => {
2506 let _ = mem::replace(col, new_col);
2507 },
2508 len => polars_bail!(
2509 ShapeMismatch:
2510 "resulting Series has length {} while the DataFrame has height {}",
2511 len, df_height
2512 ),
2513 }
2514
2515 // make sure the name remains the same after applying the closure
2516 unsafe {
2517 let col = self.columns.get_unchecked_mut(idx);
2518 col.rename(name);
2519 }
2520 Ok(self)
2521 }
2522
/// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in-place
2524 /// modification.
2525 ///
2526 /// # Example
2527 ///
/// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
2529 ///
2530 /// ```rust
2531 /// # use polars_core::prelude::*;
2532 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2533 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2534 /// let mut df = DataFrame::new(vec![s0, s1])?;
2535 ///
2536 /// let idx = vec![0, 1, 4];
2537 ///
2538 /// df.try_apply("foo", |c| {
2539 /// c.str()?
2540 /// .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2541 /// });
2542 /// # Ok::<(), PolarsError>(())
2543 /// ```
2544 /// Results in:
2545 ///
2546 /// ```text
2547 /// +---------------------+--------+
2548 /// | foo | values |
2549 /// | --- | --- |
2550 /// | str | i32 |
2551 /// +=====================+========+
2552 /// | "ham-is-modified" | 1 |
2553 /// +---------------------+--------+
2554 /// | "spam-is-modified" | 2 |
2555 /// +---------------------+--------+
2556 /// | "egg" | 3 |
2557 /// +---------------------+--------+
2558 /// | "bacon" | 4 |
2559 /// +---------------------+--------+
2560 /// | "quack-is-modified" | 5 |
2561 /// +---------------------+--------+
2562 /// ```
2563 pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2564 where
2565 F: FnOnce(&Column) -> PolarsResult<C>,
2566 C: IntoColumn,
2567 {
2568 let width = self.width();
2569 let col = self.columns.get_mut(idx).ok_or_else(|| {
2570 polars_err!(
2571 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2572 idx, width
2573 )
2574 })?;
2575 let name = col.name().clone();
2576
2577 let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2578
2579 // make sure the name remains the same after applying the closure
2580 unsafe {
2581 let col = self.columns.get_unchecked_mut(idx);
2582 col.rename(name);
2583 }
2584 Ok(self)
2585 }
2586
/// Apply a closure that may fail to a column. This is the recommended way to do in-place
2588 /// modification.
2589 ///
2590 /// # Example
2591 ///
/// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2593 ///
2594 /// ```rust
2595 /// # use polars_core::prelude::*;
2596 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2597 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2598 /// let mut df = DataFrame::new(vec![s0, s1])?;
2599 ///
2600 /// // create a mask
2601 /// let values = df.column("values")?.as_materialized_series();
2602 /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2603 ///
2604 /// df.try_apply("foo", |c| {
2605 /// c.str()?
2606 /// .set(&mask, Some("not_within_bounds"))
2607 /// });
2608 /// # Ok::<(), PolarsError>(())
2609 /// ```
2610 /// Results in:
2611 ///
2612 /// ```text
2613 /// +---------------------+--------+
2614 /// | foo | values |
2615 /// | --- | --- |
2616 /// | str | i32 |
2617 /// +=====================+========+
2618 /// | "not_within_bounds" | 1 |
2619 /// +---------------------+--------+
2620 /// | "spam" | 2 |
2621 /// +---------------------+--------+
2622 /// | "egg" | 3 |
2623 /// +---------------------+--------+
2624 /// | "bacon" | 4 |
2625 /// +---------------------+--------+
2626 /// | "not_within_bounds" | 5 |
2627 /// +---------------------+--------+
2628 /// ```
2629 pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2630 where
2631 F: FnOnce(&Series) -> PolarsResult<C>,
2632 C: IntoColumn,
2633 {
2634 let idx = self.try_get_column_index(column)?;
2635 self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2636 }
2637
2638 /// Slice the [`DataFrame`] along the rows.
2639 ///
2640 /// # Example
2641 ///
2642 /// ```rust
2643 /// # use polars_core::prelude::*;
2644 /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2645 /// "Color" => ["Green", "Red", "White", "White", "Red"])?;
2646 /// let sl: DataFrame = df.slice(2, 3);
2647 ///
2648 /// assert_eq!(sl.shape(), (3, 2));
2649 /// println!("{}", sl);
2650 /// # Ok::<(), PolarsError>(())
2651 /// ```
2652 /// Output:
2653 /// ```text
2654 /// shape: (3, 2)
2655 /// +-------+-------+
2656 /// | Fruit | Color |
2657 /// | --- | --- |
2658 /// | str | str |
2659 /// +=======+=======+
2660 /// | Grape | White |
2661 /// +-------+-------+
2662 /// | Fig | White |
2663 /// +-------+-------+
2664 /// | Fig | Red |
2665 /// +-------+-------+
2666 /// ```
2667 #[must_use]
2668 pub fn slice(&self, offset: i64, length: usize) -> Self {
2669 if offset == 0 && length == self.height() {
2670 return self.clone();
2671 }
2672 if length == 0 {
2673 return self.clear();
2674 }
2675 let col = self
2676 .columns
2677 .iter()
2678 .map(|s| s.slice(offset, length))
2679 .collect::<Vec<_>>();
2680
2681 let height = if let Some(fst) = col.first() {
2682 fst.len()
2683 } else {
2684 let (_, length) = slice_offsets(offset, length, self.height());
2685 length
2686 };
2687
2688 unsafe { DataFrame::new_no_checks(height, col) }
2689 }
2690
2691 /// Split [`DataFrame`] at the given `offset`.
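///
/// # Example
///
/// A minimal sketch with an illustrative frame:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("x" => [1, 2, 3, 4])?;
/// let (head, tail) = df.split_at(2);
///
/// assert_eq!(head.height(), 2);
/// assert_eq!(tail.height(), 2);
/// # Ok::<(), PolarsError>(())
/// ```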
2692 pub fn split_at(&self, offset: i64) -> (Self, Self) {
2693 let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2694
2695 let (idx, _) = slice_offsets(offset, 0, self.height());
2696
2697 let a = unsafe { DataFrame::new_no_checks(idx, a) };
2698 let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2699 (a, b)
2700 }
2701
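/// Return a new [`DataFrame`] with zero rows but the same column names and dtypes.
///
/// A minimal sketch:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("x" => [1, 2, 3])?;
/// let empty = df.clear();
///
/// assert_eq!(empty.height(), 0);
/// assert_eq!(empty.schema(), df.schema());
/// # Ok::<(), PolarsError>(())
/// ```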
2702 pub fn clear(&self) -> Self {
2703 let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2704 unsafe { DataFrame::new_no_checks(0, col) }
2705 }
2706
2707 #[must_use]
2708 pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2709 if offset == 0 && length == self.height() {
2710 return self.clone();
2711 }
2712 let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2713 unsafe { DataFrame::new_no_checks(length, columns) }
2714 }
2715
2716 #[must_use]
2717 pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2718 if offset == 0 && length == self.height() {
2719 return self.clone();
2720 }
2721 // @scalar-opt
2722 let columns = self._apply_columns(&|s| {
2723 let mut out = s.slice(offset, length);
2724 out.shrink_to_fit();
2725 out
2726 });
2727 unsafe { DataFrame::new_no_checks(length, columns) }
2728 }
2729
2730 /// Get the head of the [`DataFrame`].
2731 ///
2732 /// # Example
2733 ///
2734 /// ```rust
2735 /// # use polars_core::prelude::*;
2736 /// let countries: DataFrame =
2737 /// df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2738 /// "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2739 /// "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2740 /// "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2741 /// assert_eq!(countries.shape(), (5, 4));
2742 ///
2743 /// println!("{}", countries.head(Some(3)));
2744 /// # Ok::<(), PolarsError>(())
2745 /// ```
2746 ///
2747 /// Output:
2748 ///
2749 /// ```text
2750 /// shape: (3, 4)
2751 /// +--------------------+---------------+---------------+------------+
2752 /// | Rank by GDP (2021) | Continent | Country | Capital |
2753 /// | --- | --- | --- | --- |
2754 /// | i32 | str | str | str |
2755 /// +====================+===============+===============+============+
2756 /// | 1 | North America | United States | Washington |
2757 /// +--------------------+---------------+---------------+------------+
2758 /// | 2 | Asia | China | Beijing |
2759 /// +--------------------+---------------+---------------+------------+
2760 /// | 3 | Asia | Japan | Tokyo |
2761 /// +--------------------+---------------+---------------+------------+
2762 /// ```
2763 #[must_use]
2764 pub fn head(&self, length: Option<usize>) -> Self {
2765 let col = self
2766 .columns
2767 .iter()
2768 .map(|c| c.head(length))
2769 .collect::<Vec<_>>();
2770
2771 let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2772 let height = usize::min(height, self.height());
2773 unsafe { DataFrame::new_no_checks(height, col) }
2774 }
2775
2776 /// Get the tail of the [`DataFrame`].
2777 ///
2778 /// # Example
2779 ///
2780 /// ```rust
2781 /// # use polars_core::prelude::*;
2782 /// let countries: DataFrame =
2783 /// df!("Rank (2021)" => [105, 106, 107, 108, 109],
2784 /// "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2785 /// "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2786 /// assert_eq!(countries.shape(), (5, 3));
2787 ///
2788 /// println!("{}", countries.tail(Some(2)));
2789 /// # Ok::<(), PolarsError>(())
2790 /// ```
2791 ///
2792 /// Output:
2793 ///
2794 /// ```text
2795 /// shape: (2, 3)
2796 /// +-------------+--------------------+---------+
2797 /// | Rank (2021) | Apple Price (€/kg) | Country |
2798 /// | --- | --- | --- |
2799 /// | i32 | f64 | str |
2800 /// +=============+====================+=========+
/// | 108 | 0.65 | Syria |
/// +-------------+--------------------+---------+
/// | 109 | 0.52 | Turkey |
2804 /// +-------------+--------------------+---------+
2805 /// ```
2806 #[must_use]
2807 pub fn tail(&self, length: Option<usize>) -> Self {
2808 let col = self
2809 .columns
2810 .iter()
2811 .map(|c| c.tail(length))
2812 .collect::<Vec<_>>();
2813
2814 let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2815 let height = usize::min(height, self.height());
2816 unsafe { DataFrame::new_no_checks(height, col) }
2817 }
2818
/// Iterator over the chunks in this [`DataFrame`] as Arrow RecordBatches.
2820 ///
2821 /// # Panics
2822 ///
2823 /// Panics if the [`DataFrame`] that is passed is not rechunked.
2824 ///
2825 /// This responsibility is left to the caller as we don't want to take mutable references here,
2826 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2827 /// as well.
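///
/// # Example
///
/// A minimal sketch; each item is one Arrow `RecordBatch` covering a single chunk:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame) {
///     for record_batch in df.iter_chunks(CompatLevel::newest(), false) {
///         // Hand the batch to an Arrow consumer, write it to IPC, etc.
///         let _ = record_batch;
///     }
/// }
/// ```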
2828 pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter {
2829 debug_assert!(!self.should_rechunk(), "expected equal chunks");
// If any of the columns is a binview (String/Binary) and we must convert for the given
// `compat_level`, we allow parallelism, as we have to allocate new arrow strings/binaries.
2832 let must_convert = compat_level.0 == 0;
2833 let parallel = parallel
2834 && must_convert
2835 && self.columns.len() > 1
2836 && self
2837 .columns
2838 .iter()
2839 .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2840
2841 RecordBatchIter {
2842 columns: &self.columns,
2843 schema: Arc::new(
2844 self.columns
2845 .iter()
2846 .map(|c| c.field().to_arrow(compat_level))
2847 .collect(),
2848 ),
2849 idx: 0,
2850 n_chunks: self.first_col_n_chunks(),
2851 compat_level,
2852 parallel,
2853 }
2854 }
2855
/// Iterator over the chunks in this [`DataFrame`] as Arrow RecordBatches of physical values.
2857 ///
2858 /// # Panics
2859 ///
2860 /// Panics if the [`DataFrame`] that is passed is not rechunked.
2861 ///
2862 /// This responsibility is left to the caller as we don't want to take mutable references here,
2863 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2864 /// as well.
2865 pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2866 debug_assert!(!self.should_rechunk());
2867 PhysRecordBatchIter {
2868 schema: Arc::new(
2869 self.get_columns()
2870 .iter()
2871 .map(|c| c.field().to_arrow(CompatLevel::newest()))
2872 .collect(),
2873 ),
2874 arr_iters: self
2875 .materialized_column_iter()
2876 .map(|s| s.chunks().iter())
2877 .collect(),
2878 }
2879 }
2880
/// Get a [`DataFrame`] with the rows in reversed order.
2882 #[must_use]
2883 pub fn reverse(&self) -> Self {
2884 let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2885 unsafe { DataFrame::new_no_checks(self.height(), col) }
2886 }
2887
/// Shift the values by a given period and fill the parts that will be empty due to this operation
/// with `None` values.
2890 ///
2891 /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
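///
/// # Example
///
/// A minimal sketch with an illustrative column:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [1, 2, 3])?;
/// let shifted = df.shift(1);
///
/// // The height is unchanged; one null is shifted in at the start of "a".
/// assert_eq!(shifted.height(), 3);
/// assert_eq!(shifted.column("a")?.null_count(), 1);
/// # Ok::<(), PolarsError>(())
/// ```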
2892 #[must_use]
2893 pub fn shift(&self, periods: i64) -> Self {
2894 let col = self._apply_columns_par(&|s| s.shift(periods));
2895 unsafe { DataFrame::new_no_checks(self.height(), col) }
2896 }
2897
2898 /// Replace None values with one of the following strategies:
2899 /// * Forward fill (replace None with the previous value)
2900 /// * Backward fill (replace None with the next value)
2901 /// * Mean fill (replace None with the mean of the whole array)
2902 /// * Min fill (replace None with the minimum of the whole array)
2903 /// * Max fill (replace None with the maximum of the whole array)
2904 ///
2905 /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
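///
/// # Example
///
/// A minimal sketch using the mean strategy; any [`FillNullStrategy`] variant works the same way:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
///     df.fill_null(FillNullStrategy::Mean)
/// }
/// ```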
2906 pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2907 let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
2908
2909 Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
2910 }
2911
/// Pipe different functions/closure operations that work on a DataFrame together.
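///
/// # Example
///
/// A minimal sketch; the piped closure here simply returns the height:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: DataFrame) -> PolarsResult<usize> {
///     // Any `Fn(DataFrame) -> PolarsResult<B>` can be piped.
///     df.pipe(|df| Ok(df.height()))
/// }
/// ```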
2913 pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
2914 where
2915 F: Fn(DataFrame) -> PolarsResult<B>,
2916 {
2917 f(self)
2918 }
2919
/// Pipe different functions/closure operations that work on a DataFrame together.
2921 pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
2922 where
2923 F: Fn(&mut DataFrame) -> PolarsResult<B>,
2924 {
2925 f(self)
2926 }
2927
/// Pipe different functions/closure operations that work on a DataFrame together.
2929 pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
2930 where
2931 F: Fn(DataFrame, Args) -> PolarsResult<B>,
2932 {
2933 f(self, args)
2934 }
2935
2936 /// Drop duplicate rows from a [`DataFrame`].
/// *This fails when there is a column of type List in the DataFrame.*
2938 ///
2939 /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
2940 ///
2941 /// # Example
2942 ///
2943 /// ```no_run
2944 /// # use polars_core::prelude::*;
2945 /// let df = df! {
2946 /// "flt" => [1., 1., 2., 2., 3., 3.],
2947 /// "int" => [1, 1, 2, 2, 3, 3, ],
2948 /// "str" => ["a", "a", "b", "b", "c", "c"]
2949 /// }?;
2950 ///
2951 /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
2952 /// # Ok::<(), PolarsError>(())
2953 /// ```
2954 /// Returns
2955 ///
2956 /// ```text
2957 /// +-----+-----+-----+
2958 /// | flt | int | str |
2959 /// | --- | --- | --- |
2960 /// | f64 | i32 | str |
2961 /// +=====+=====+=====+
2962 /// | 1 | 1 | "a" |
2963 /// +-----+-----+-----+
2964 /// | 2 | 2 | "b" |
2965 /// +-----+-----+-----+
2966 /// | 3 | 3 | "c" |
2967 /// +-----+-----+-----+
2968 /// ```
2969 #[cfg(feature = "algorithm_group_by")]
2970 pub fn unique_stable(
2971 &self,
2972 subset: Option<&[String]>,
2973 keep: UniqueKeepStrategy,
2974 slice: Option<(i64, usize)>,
2975 ) -> PolarsResult<DataFrame> {
2976 self.unique_impl(
2977 true,
2978 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2979 keep,
2980 slice,
2981 )
2982 }
2983
2984 /// Unstable distinct. See [`DataFrame::unique_stable`].
2985 #[cfg(feature = "algorithm_group_by")]
2986 pub fn unique<I, S>(
2987 &self,
2988 subset: Option<&[String]>,
2989 keep: UniqueKeepStrategy,
2990 slice: Option<(i64, usize)>,
2991 ) -> PolarsResult<DataFrame> {
2992 self.unique_impl(
2993 false,
2994 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2995 keep,
2996 slice,
2997 )
2998 }
2999
3000 #[cfg(feature = "algorithm_group_by")]
3001 pub fn unique_impl(
3002 &self,
3003 maintain_order: bool,
3004 subset: Option<Vec<PlSmallStr>>,
3005 keep: UniqueKeepStrategy,
3006 slice: Option<(i64, usize)>,
3007 ) -> PolarsResult<Self> {
3008 let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3009 let mut df = self.clone();
3010 // take on multiple chunks is terrible
3011 df.as_single_chunk_par();
3012
3013 let columns = match (keep, maintain_order) {
3014 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3015 let gb = df.group_by_stable(names)?;
3016 let groups = gb.get_groups();
3017 let (offset, len) = slice.unwrap_or((0, groups.len()));
3018 let groups = groups.slice(offset, len);
3019 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3020 },
3021 (UniqueKeepStrategy::Last, true) => {
3022 // maintain order by last values, so the sorted groups are not correct as they
3023 // are sorted by the first value
3024 let gb = df.group_by_stable(names)?;
3025 let groups = gb.get_groups();
3026
3027 let last_idx: NoNull<IdxCa> = groups
3028 .iter()
3029 .map(|g| match g {
3030 GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3031 GroupsIndicator::Slice([first, len]) => first + len - 1,
3032 })
3033 .collect();
3034
3035 let mut last_idx = last_idx.into_inner().sort(false);
3036
3037 if let Some((offset, len)) = slice {
3038 last_idx = last_idx.slice(offset, len);
3039 }
3040
3041 let last_idx = NoNull::new(last_idx);
3042 let out = unsafe { df.take_unchecked(&last_idx) };
3043 return Ok(out);
3044 },
3045 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3046 let gb = df.group_by(names)?;
3047 let groups = gb.get_groups();
3048 let (offset, len) = slice.unwrap_or((0, groups.len()));
3049 let groups = groups.slice(offset, len);
3050 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3051 },
3052 (UniqueKeepStrategy::Last, false) => {
3053 let gb = df.group_by(names)?;
3054 let groups = gb.get_groups();
3055 let (offset, len) = slice.unwrap_or((0, groups.len()));
3056 let groups = groups.slice(offset, len);
3057 df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3058 },
3059 (UniqueKeepStrategy::None, _) => {
3060 let df_part = df.select(names)?;
3061 let mask = df_part.is_unique()?;
3062 let mut filtered = df.filter(&mask)?;
3063
3064 if let Some((offset, len)) = slice {
3065 filtered = filtered.slice(offset, len);
3066 }
3067 return Ok(filtered);
3068 },
3069 };
3070 let height = Self::infer_height(&columns);
3071 Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3072 }
3073
3074 /// Get a mask of all the unique rows in the [`DataFrame`].
3075 ///
3076 /// # Example
3077 ///
3078 /// ```no_run
3079 /// # use polars_core::prelude::*;
3080 /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3081 /// "ISIN" => ["US0378331005", "US5949181045"])?;
3082 /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3083 ///
3084 /// assert!(ca.all());
3085 /// # Ok::<(), PolarsError>(())
3086 /// ```
3087 #[cfg(feature = "algorithm_group_by")]
3088 pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3089 let gb = self.group_by(self.get_column_names_owned())?;
3090 let groups = gb.get_groups();
3091 Ok(is_unique_helper(
3092 groups,
3093 self.height() as IdxSize,
3094 true,
3095 false,
3096 ))
3097 }
3098
3099 /// Get a mask of all the duplicated rows in the [`DataFrame`].
3100 ///
3101 /// # Example
3102 ///
3103 /// ```no_run
3104 /// # use polars_core::prelude::*;
3105 /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3106 /// "ISIN" => ["US02079K3059", "US02079K1079"])?;
3107 /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3108 ///
3109 /// assert!(!ca.all());
3110 /// # Ok::<(), PolarsError>(())
3111 /// ```
3112 #[cfg(feature = "algorithm_group_by")]
3113 pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3114 let gb = self.group_by(self.get_column_names_owned())?;
3115 let groups = gb.get_groups();
3116 Ok(is_unique_helper(
3117 groups,
3118 self.height() as IdxSize,
3119 false,
3120 true,
3121 ))
3122 }
3123
3124 /// Create a new [`DataFrame`] that shows the null counts per column.
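///
/// # Example
///
/// A minimal sketch; the result has a single row with one count per column:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [Some(1), None, Some(3)])?;
/// let counts = df.null_count();
///
/// assert_eq!(counts.shape(), (1, 1));
/// # Ok::<(), PolarsError>(())
/// ```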
3125 #[must_use]
3126 pub fn null_count(&self) -> Self {
3127 let cols = self
3128 .columns
3129 .iter()
3130 .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3131 .collect();
3132 unsafe { Self::new_no_checks(1, cols) }
3133 }
3134
3135 /// Hash and combine the row values
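///
/// # Example
///
/// A minimal sketch; passing `None` uses a default hasher:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &mut DataFrame) -> PolarsResult<UInt64Chunked> {
///     df.hash_rows(None)
/// }
/// ```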
3136 #[cfg(feature = "row_hash")]
3137 pub fn hash_rows(
3138 &mut self,
3139 hasher_builder: Option<PlSeedableRandomStateQuality>,
3140 ) -> PolarsResult<UInt64Chunked> {
3141 let dfs = split_df(self, POOL.current_num_threads(), false);
3142 let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3143
3144 let mut iter = cas.into_iter();
3145 let mut acc_ca = iter.next().unwrap();
3146 for ca in iter {
3147 acc_ca.append(&ca)?;
3148 }
3149 Ok(acc_ca.rechunk().into_owned())
3150 }
3151
3152 /// Get the supertype of the columns in this DataFrame
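///
/// # Example
///
/// A small sketch; the promotion rules are those of `try_get_supertype`:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("ints" => [1i32, 2], "floats" => [1.0f64, 2.0])?;
///
/// assert_eq!(df.get_supertype().unwrap()?, DataType::Float64);
/// # Ok::<(), PolarsError>(())
/// ```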
3153 pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3154 self.columns
3155 .iter()
3156 .map(|s| Ok(s.dtype().clone()))
3157 .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3158 }
3159
3160 /// Take by index values given by the slice `idx`.
3161 /// # Warning
/// Be careful with allowing threads when calling this in a large hot loop;
/// every thread split may be on the rayon stack and lead to a stack overflow.
3164 #[doc(hidden)]
3165 pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3166 self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3167 }
3168
3169 /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
/// if the index values in `idx` are sorted. This will maintain the sorted flags.
3171 ///
3172 /// # Warning
/// Be careful with allowing threads when calling this in a large hot loop;
/// every thread split may be on the rayon stack and lead to a stack overflow.
3175 #[doc(hidden)]
3176 pub unsafe fn _take_unchecked_slice_sorted(
3177 &self,
3178 idx: &[IdxSize],
3179 allow_threads: bool,
3180 sorted: IsSorted,
3181 ) -> Self {
3182 #[cfg(debug_assertions)]
3183 {
3184 if idx.len() > 2 {
3185 match sorted {
3186 IsSorted::Ascending => {
3187 assert!(idx[0] <= idx[idx.len() - 1]);
3188 },
3189 IsSorted::Descending => {
3190 assert!(idx[0] >= idx[idx.len() - 1]);
3191 },
3192 _ => {},
3193 }
3194 }
3195 }
3196 let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3197 ca.set_sorted_flag(sorted);
3198 self.take_unchecked_impl(&ca, allow_threads)
3199 }
3200
3201 #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3202 #[doc(hidden)]
3203 pub fn _partition_by_impl(
3204 &self,
3205 cols: &[PlSmallStr],
3206 stable: bool,
3207 include_key: bool,
3208 parallel: bool,
3209 ) -> PolarsResult<Vec<DataFrame>> {
3210 let selected_keys = self.select_columns(cols.iter().cloned())?;
3211 let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3212 let groups = groups.take_groups();
3213
3214 // drop key columns prior to calculation if requested
3215 let df = if include_key {
3216 self.clone()
3217 } else {
3218 self.drop_many(cols.iter().cloned())
3219 };
3220
3221 if parallel {
3222 // don't parallelize this
3223 // there is a lot of parallelization in take and this may easily SO
3224 POOL.install(|| {
3225 match groups.as_ref() {
3226 GroupsType::Idx(idx) => {
3227 // Rechunk as the gather may rechunk for every group #17562.
3228 let mut df = df.clone();
3229 df.as_single_chunk_par();
3230 Ok(idx
3231 .into_par_iter()
3232 .map(|(_, group)| {
3233 // groups are in bounds
3234 unsafe {
3235 df._take_unchecked_slice_sorted(
3236 group,
3237 false,
3238 IsSorted::Ascending,
3239 )
3240 }
3241 })
3242 .collect())
3243 },
3244 GroupsType::Slice { groups, .. } => Ok(groups
3245 .into_par_iter()
3246 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3247 .collect()),
3248 }
3249 })
3250 } else {
3251 match groups.as_ref() {
3252 GroupsType::Idx(idx) => {
3253 // Rechunk as the gather may rechunk for every group #17562.
3254 let mut df = df.clone();
3255 df.as_single_chunk();
3256 Ok(idx
3257 .into_iter()
3258 .map(|(_, group)| {
3259 // groups are in bounds
3260 unsafe {
3261 df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3262 }
3263 })
3264 .collect())
3265 },
3266 GroupsType::Slice { groups, .. } => Ok(groups
3267 .iter()
3268 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3269 .collect()),
3270 }
3271 }
3272 }
3273
/// Split into multiple DataFrames partitioned by groups.
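///
/// # Example
///
/// A minimal sketch; `"category"` is an illustrative key column:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame) -> PolarsResult<Vec<DataFrame>> {
///     df.partition_by(["category"], true)
/// }
/// ```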
3275 #[cfg(feature = "partition_by")]
3276 pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3277 where
3278 I: IntoIterator<Item = S>,
3279 S: Into<PlSmallStr>,
3280 {
3281 let cols = cols
3282 .into_iter()
3283 .map(Into::into)
3284 .collect::<Vec<PlSmallStr>>();
3285 self._partition_by_impl(cols.as_slice(), false, include_key, true)
3286 }
3287
/// Split into multiple DataFrames partitioned by groups.
/// The order of the groups is maintained.
3290 #[cfg(feature = "partition_by")]
3291 pub fn partition_by_stable<I, S>(
3292 &self,
3293 cols: I,
3294 include_key: bool,
3295 ) -> PolarsResult<Vec<DataFrame>>
3296 where
3297 I: IntoIterator<Item = S>,
3298 S: Into<PlSmallStr>,
3299 {
3300 let cols = cols
3301 .into_iter()
3302 .map(Into::into)
3303 .collect::<Vec<PlSmallStr>>();
3304 self._partition_by_impl(cols.as_slice(), true, include_key, true)
3305 }
3306
3307 /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3308 /// inserted as columns.
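///
/// # Example
///
/// A minimal sketch; `"my_struct"` is an illustrative `Struct` column name:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
///     df.unnest(["my_struct"])
/// }
/// ```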
3309 #[cfg(feature = "dtype-struct")]
3310 pub fn unnest<I: IntoVec<PlSmallStr>>(&self, cols: I) -> PolarsResult<DataFrame> {
3311 let cols = cols.into_vec();
3312 self.unnest_impl(cols.into_iter().collect())
3313 }
3314
3315 #[cfg(feature = "dtype-struct")]
3316 fn unnest_impl(&self, cols: PlHashSet<PlSmallStr>) -> PolarsResult<DataFrame> {
3317 let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3318 let mut count = 0;
3319 for s in &self.columns {
3320 if cols.contains(s.name()) {
3321 let ca = s.struct_()?.clone();
3322 new_cols.extend(ca.fields_as_series().into_iter().map(Column::from));
3323 count += 1;
3324 } else {
3325 new_cols.push(s.clone())
3326 }
3327 }
3328 if count != cols.len() {
3329 // one or more columns not found
3330 // the code below will return an error with the missing name
3331 let schema = self.schema();
3332 for col in cols {
3333 let _ = schema
3334 .get(col.as_str())
3335 .ok_or_else(|| polars_err!(col_not_found = col))?;
3336 }
3337 }
3338 DataFrame::new(new_cols)
3339 }
3340
3341 pub(crate) fn infer_height(cols: &[Column]) -> usize {
3342 cols.first().map_or(0, Column::len)
3343 }
3344
3345 pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
// @Optimize: this does a lot of unnecessary allocations. We should probably have an
// append_chunk or something like it. It is just quite difficult to make that safe.
3348 let df = DataFrame::from(rb);
3349 polars_ensure!(
3350 self.schema() == df.schema(),
SchemaMismatch: "cannot append record batch with different schema\n\nGot {:?}\nexpected: {:?}",
df.schema(), self.schema(),
3353 );
3354 self.vstack_mut_owned_unchecked(df);
3355 Ok(())
3356 }
3357}
3358
3359pub struct RecordBatchIter<'a> {
3360 columns: &'a Vec<Column>,
3361 schema: ArrowSchemaRef,
3362 idx: usize,
3363 n_chunks: usize,
3364 compat_level: CompatLevel,
3365 parallel: bool,
3366}
3367
3368impl Iterator for RecordBatchIter<'_> {
3369 type Item = RecordBatch;
3370
3371 fn next(&mut self) -> Option<Self::Item> {
3372 if self.idx >= self.n_chunks {
3373 return None;
3374 }
3375
3376 // Create a batch of the columns with the same chunk no.
3377 let batch_cols: Vec<ArrayRef> = if self.parallel {
3378 let iter = self
3379 .columns
3380 .par_iter()
3381 .map(Column::as_materialized_series)
3382 .map(|s| s.to_arrow(self.idx, self.compat_level));
3383 POOL.install(|| iter.collect())
3384 } else {
3385 self.columns
3386 .iter()
3387 .map(Column::as_materialized_series)
3388 .map(|s| s.to_arrow(self.idx, self.compat_level))
3389 .collect()
3390 };
3391 self.idx += 1;
3392
3393 let length = batch_cols.first().map_or(0, |arr| arr.len());
3394 Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3395 }
3396
3397 fn size_hint(&self) -> (usize, Option<usize>) {
3398 let n = self.n_chunks - self.idx;
3399 (n, Some(n))
3400 }
3401}
3402
3403pub struct PhysRecordBatchIter<'a> {
3404 schema: ArrowSchemaRef,
3405 arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3406}
3407
3408impl Iterator for PhysRecordBatchIter<'_> {
3409 type Item = RecordBatch;
3410
3411 fn next(&mut self) -> Option<Self::Item> {
3412 let arrs = self
3413 .arr_iters
3414 .iter_mut()
3415 .map(|phys_iter| phys_iter.next().cloned())
3416 .collect::<Option<Vec<_>>>()?;
3417
3418 let length = arrs.first().map_or(0, |arr| arr.len());
3419 Some(RecordBatch::new(length, self.schema.clone(), arrs))
3420 }
3421
3422 fn size_hint(&self) -> (usize, Option<usize>) {
3423 if let Some(iter) = self.arr_iters.first() {
3424 iter.size_hint()
3425 } else {
3426 (0, None)
3427 }
3428 }
3429}
3430
3431impl Default for DataFrame {
3432 fn default() -> Self {
3433 DataFrame::empty()
3434 }
3435}
3436
3437impl From<DataFrame> for Vec<Column> {
3438 fn from(df: DataFrame) -> Self {
3439 df.columns
3440 }
3441}
3442
3443// utility to test if we can vstack/extend the columns
3444fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3445 polars_ensure!(
3446 left.name() == right.name(),
3447 ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3448 left.name(), right.name(),
3449 );
3450 Ok(())
3451}
3452
3453#[cfg(test)]
3454mod test {
3455 use super::*;
3456
3457 fn create_frame() -> DataFrame {
3458 let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
3459 let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
3460 DataFrame::new(vec![s0, s1]).unwrap()
3461 }
3462
3463 #[test]
3464 #[cfg_attr(miri, ignore)]
3465 fn test_recordbatch_iterator() {
3466 let df = df!(
3467 "foo" => [1, 2, 3, 4, 5]
3468 )
3469 .unwrap();
3470 let mut iter = df.iter_chunks(CompatLevel::newest(), false);
3471 assert_eq!(5, iter.next().unwrap().len());
3472 assert!(iter.next().is_none());
3473 }
3474
3475 #[test]
3476 #[cfg_attr(miri, ignore)]
3477 fn test_select() {
3478 let df = create_frame();
3479 assert_eq!(
3480 df.column("days")
3481 .unwrap()
3482 .as_series()
3483 .unwrap()
3484 .equal(1)
3485 .unwrap()
3486 .sum(),
3487 Some(1)
3488 );
3489 }
3490
3491 #[test]
3492 #[cfg_attr(miri, ignore)]
3493 fn test_filter_broadcast_on_string_col() {
3494 let col_name = "some_col";
3495 let v = vec!["test".to_string()];
3496 let s0 = Column::new(PlSmallStr::from_str(col_name), v);
3497 let mut df = DataFrame::new(vec![s0]).unwrap();
3498
3499 df = df
3500 .filter(
3501 &df.column(col_name)
3502 .unwrap()
3503 .as_materialized_series()
3504 .equal("")
3505 .unwrap(),
3506 )
3507 .unwrap();
3508 assert_eq!(
3509 df.column(col_name)
3510 .unwrap()
3511 .as_materialized_series()
3512 .n_chunks(),
3513 1
3514 );
3515 }
3516
3517 #[test]
3518 #[cfg_attr(miri, ignore)]
3519 fn test_filter_broadcast_on_list_col() {
3520 let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
3521 let ll: ListChunked = [&s1].iter().copied().collect();
3522
3523 let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
3524 let new = ll.filter(&mask).unwrap();
3525
3526 assert_eq!(new.chunks.len(), 1);
3527 assert_eq!(new.len(), 0);
3528 }
3529
3530 #[test]
3531 fn slice() {
3532 let df = create_frame();
3533 let sliced_df = df.slice(0, 2);
3534 assert_eq!(sliced_df.shape(), (2, 2));
3535 }
3536
3537 #[test]
3538 fn rechunk_false() {
3539 let df = create_frame();
3540 assert!(!df.should_rechunk())
3541 }
3542
3543 #[test]
3544 fn rechunk_true() -> PolarsResult<()> {
3545 let mut base = df!(
3546 "a" => [1, 2, 3],
3547 "b" => [1, 2, 3]
3548 )?;
3549
3550 // Create a series with multiple chunks
3551 let mut s = Series::new("foo".into(), 0..2);
3552 let s2 = Series::new("bar".into(), 0..1);
3553 s.append(&s2)?;
3554
3555 // Append series to frame
3556 let out = base.with_column(s)?;
3557
3558 // Now we should rechunk
3559 assert!(out.should_rechunk());
3560 Ok(())
3561 }
3562
3563 #[test]
3564 fn test_duplicate_column() {
3565 let mut df = df! {
3566 "foo" => [1, 2, 3]
3567 }
3568 .unwrap();
3569 // check if column is replaced
3570 assert!(
3571 df.with_column(Series::new("foo".into(), &[1, 2, 3]))
3572 .is_ok()
3573 );
3574 assert!(
3575 df.with_column(Series::new("bar".into(), &[1, 2, 3]))
3576 .is_ok()
3577 );
3578 assert!(df.column("bar").is_ok())
3579 }
3580
3581 #[test]
3582 #[cfg_attr(miri, ignore)]
3583 fn distinct() {
3584 let df = df! {
3585 "flt" => [1., 1., 2., 2., 3., 3.],
3586 "int" => [1, 1, 2, 2, 3, 3, ],
3587 "str" => ["a", "a", "b", "b", "c", "c"]
3588 }
3589 .unwrap();
3590 let df = df
3591 .unique_stable(None, UniqueKeepStrategy::First, None)
3592 .unwrap()
3593 .sort(["flt"], SortMultipleOptions::default())
3594 .unwrap();
3595 let valid = df! {
3596 "flt" => [1., 2., 3.],
3597 "int" => [1, 2, 3],
3598 "str" => ["a", "b", "c"]
3599 }
3600 .unwrap();
3601 assert!(df.equals(&valid));
3602 }
3603
3604 #[test]
3605 fn test_vstack() {
// check that it does not accidentally rechunk
3607 let mut df = df! {
3608 "flt" => [1., 1., 2., 2., 3., 3.],
3609 "int" => [1, 1, 2, 2, 3, 3, ],
3610 "str" => ["a", "a", "b", "b", "c", "c"]
3611 }
3612 .unwrap();
3613
3614 df.vstack_mut(&df.slice(0, 3)).unwrap();
3615 assert_eq!(df.first_col_n_chunks(), 2)
3616 }
3617
3618 #[test]
3619 fn test_vstack_on_empty_dataframe() {
3620 let mut df = DataFrame::empty();
3621
3622 let df_data = df! {
3623 "flt" => [1., 1., 2., 2., 3., 3.],
3624 "int" => [1, 1, 2, 2, 3, 3, ],
3625 "str" => ["a", "a", "b", "b", "c", "c"]
3626 }
3627 .unwrap();
3628
3629 df.vstack_mut(&df_data).unwrap();
3630 assert_eq!(df.height, 6)
3631 }
3632
3633 #[test]
3634 fn test_replace_or_add() -> PolarsResult<()> {
3635 let mut df = df!(
3636 "a" => [1, 2, 3],
3637 "b" => [1, 2, 3]
3638 )?;
3639
3640 // check that the new column is "c" and not "bar".
3641 df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;
3642
3643 assert_eq!(df.get_column_names(), &["a", "b", "c"]);
3644 Ok(())
3645 }
3646
3647 #[test]
3648 fn test_unique_keep_none_with_slice() {
3649 let df = df! {
3650 "x" => [1, 2, 3, 2, 1]
3651 }
3652 .unwrap();
3653 let out = df
3654 .unique_stable(
3655 Some(&["x".to_string()][..]),
3656 UniqueKeepStrategy::None,
3657 Some((0, 2)),
3658 )
3659 .unwrap();
3660 let expected = df! {
3661 "x" => [3]
3662 }
3663 .unwrap();
3664 assert!(out.equals(&expected));
3665 }
3666}