polars_core/frame/mod.rs
1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(any(feature = "rows", feature = "object"))]
33pub mod row;
34mod top_k;
35mod upstream_traits;
36mod validation;
37
38use arrow::record_batch::{RecordBatch, RecordBatchT};
39use polars_utils::pl_str::PlSmallStr;
40#[cfg(feature = "serde")]
41use serde::{Deserialize, Serialize};
42use strum_macros::IntoStaticStr;
43
44use crate::POOL;
45#[cfg(feature = "row_hash")]
46use crate::hashing::_df_rows_to_hashes_threaded_vertical;
47use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
48use crate::series::IsSorted;
49
50#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
51#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
52#[strum(serialize_all = "snake_case")]
53pub enum UniqueKeepStrategy {
54 /// Keep the first unique row.
55 First,
56 /// Keep the last unique row.
57 Last,
58 /// Keep none of the unique rows.
59 None,
60 /// Keep any of the unique rows.
61 /// This allows more optimizations.
62 #[default]
63 Any,
64}
65
66fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
67where
68 F: for<'a> FnMut(&'a T) -> &'a str,
69{
70 // Always unique.
71 if items.len() <= 1 {
72 return Ok(());
73 }
74
75 if items.len() <= 4 {
76 // Too small to be worth spawning a hashmap for; this is at most 6 comparisons.
77 for i in 0..items.len() - 1 {
78 let name = get_name(&items[i]);
79 for other in items.iter().skip(i + 1) {
80 if name == get_name(other) {
81 polars_bail!(duplicate = name);
82 }
83 }
84 }
85 } else {
86 let mut names = PlHashSet::with_capacity(items.len());
87 for item in items {
88 let name = get_name(item);
89 if !names.insert(name) {
90 polars_bail!(duplicate = name);
91 }
92 }
93 }
94 Ok(())
95}
96
97/// A contiguous growable collection of `Series` that have the same length.
98///
99/// ## Use declarations
100///
101/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
102///
103/// ```rust
104/// use polars_core::prelude::*; // if the crate polars-core is used directly
105/// // use polars::prelude::*; if the crate polars is used
106/// ```
107///
108/// # Initialization
109/// ## Default
110///
111/// A `DataFrame` can be initialized empty:
112///
113/// ```rust
114/// # use polars_core::prelude::*;
115/// let df = DataFrame::default();
116/// assert!(df.is_empty());
117/// ```
118///
119/// ## Wrapping a `Vec<Series>`
120///
121 /// A `DataFrame` is built upon a `Vec<Column>` where the columns have the same length.
122///
123/// ```rust
124/// # use polars_core::prelude::*;
125/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
126/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
127///
128/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
129/// ```
130///
131/// ## Using a macro
132///
133/// The [`df!`] macro is a convenient method:
134///
135/// ```rust
136/// # use polars_core::prelude::*;
137/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
138/// "Color" => ["Red", "Yellow", "Green"]);
139/// ```
140///
141/// ## Using a CSV file
142///
143/// See the `polars_io::csv::CsvReader`.
144///
145/// # Indexing
146/// ## By a number
147///
148 /// `Index<usize>` is implemented for `DataFrame`.
149///
150/// ```rust
151/// # use polars_core::prelude::*;
152/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
153/// "Color" => ["Red", "Yellow", "Green"])?;
154///
155/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
156/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
157/// # Ok::<(), PolarsError>(())
158/// ```
159///
160/// ## By a `Series` name
161///
162/// ```rust
163/// # use polars_core::prelude::*;
164/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
165/// "Color" => ["Red", "Yellow", "Green"])?;
166///
167/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
168/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
169/// # Ok::<(), PolarsError>(())
170/// ```
171#[derive(Clone)]
172pub struct DataFrame {
173 height: usize,
174 // invariant: columns[i].len() == height for each 0 <= i < columns.len()
175 pub(crate) columns: Vec<Column>,
176
177 /// A cached schema. This might not give correct results if the DataFrame is modified in place
178 /// between caching the schema and reading it.
179 cached_schema: OnceLock<SchemaRef>,
180}
181
182impl DataFrame {
183 pub fn clear_schema(&mut self) {
184 self.cached_schema = OnceLock::new();
185 }
186
187 #[inline]
188 pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
189 self.columns.iter()
190 }
191
192 #[inline]
193 pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
194 self.columns.iter().map(Column::as_materialized_series)
195 }
196
197 #[inline]
198 pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
199 self.columns.par_iter().map(Column::as_materialized_series)
200 }
201
202 /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
203 ///
204 /// # Implementation
205 /// This estimation is the sum of the sizes of its buffers and validity bitmaps, including nested arrays.
206 /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
207 /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
208 ///
209 /// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
210 /// However, this function will yield a smaller number. This is because this function returns
211 /// the visible size of the buffer, not its total capacity.
212 ///
213 /// FFI buffers are included in this estimation.
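///
/// # Example
///
/// A minimal sketch; the exact value depends on the allocation layout, so only a loose
/// assertion is made here:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("values" => [1i64, 2, 3])?;
/// assert!(df.estimated_size() > 0);
/// # Ok::<(), PolarsError>(())
/// ```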
214 pub fn estimated_size(&self) -> usize {
215 self.columns.iter().map(Column::estimated_size).sum()
216 }
217
218 // Reduce monomorphization.
219 fn try_apply_columns(
220 &self,
221 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
222 ) -> PolarsResult<Vec<Column>> {
223 self.columns.iter().map(func).collect()
224 }
225 // Reduce monomorphization.
226 pub fn _apply_columns(&self, func: &(dyn Fn(&Column) -> Column)) -> Vec<Column> {
227 self.columns.iter().map(func).collect()
228 }
229 // Reduce monomorphization.
230 fn try_apply_columns_par(
231 &self,
232 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
233 ) -> PolarsResult<Vec<Column>> {
234 POOL.install(|| self.columns.par_iter().map(func).collect())
235 }
236 // Reduce monomorphization.
237 pub fn _apply_columns_par(
238 &self,
239 func: &(dyn Fn(&Column) -> Column + Send + Sync),
240 ) -> Vec<Column> {
241 POOL.install(|| self.columns.par_iter().map(func).collect())
242 }
243
244 /// Get the index of the column.
245 fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
246 self.get_column_index(name)
247 .ok_or_else(|| polars_err!(col_not_found = name))
248 }
249
250 fn check_already_present(&self, name: &str) -> PolarsResult<()> {
251 polars_ensure!(
252 self.columns.iter().all(|s| s.name().as_str() != name),
253 Duplicate: "column with name {:?} is already present in the DataFrame", name
254 );
255 Ok(())
256 }
257
258 /// Reserve `additional` chunk slots in each series' chunk vector.
259 pub(crate) fn reserve_chunks(&mut self, additional: usize) {
260 for s in &mut self.columns {
261 if let Column::Series(s) = s {
262 // SAFETY:
263 // do not modify the data, simply resize.
264 unsafe { s.chunks_mut().reserve(additional) }
265 }
266 }
267 }
268
269 /// Create a DataFrame from a Vector of Series.
270 ///
271 /// Errors if column names are not unique or if heights are not all equal.
272 ///
273 /// # Example
274 ///
275 /// ```
276 /// # use polars_core::prelude::*;
277 /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
278 /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
279 ///
280 /// let df = DataFrame::new(vec![s0, s1])?;
281 /// # Ok::<(), PolarsError>(())
282 /// ```
283 pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
284 DataFrame::validate_columns_slice(&columns)
285 .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {}", e)))?;
286 Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
287 }
288
289 pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
290 for col in &columns {
291 polars_ensure!(
292 col.len() == height,
293 ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
294 columns[0].name(), height, col.name(), col.len()
295 );
296 }
297
298 Ok(DataFrame {
299 height,
300 columns,
301 cached_schema: OnceLock::new(),
302 })
303 }
304
305 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
306 /// columns to match the other columns.
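///
/// # Example
///
/// A minimal sketch (column names are illustrative): the length-1 column is repeated to
/// match the length-3 column.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let a = Column::new("a".into(), [1i32, 2, 3]);
/// let b = Column::new("b".into(), [10i32]);
/// let df = DataFrame::new_with_broadcast(vec![a, b])?;
/// assert_eq!(df.shape(), (3, 2));
/// # Ok::<(), PolarsError>(())
/// ```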
307 pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
308 // The length of the longest non-unit length column determines the
309 // broadcast length. If all columns are unit-length the broadcast length
310 // is one.
311 let broadcast_len = columns
312 .iter()
313 .map(|s| s.len())
314 .filter(|l| *l != 1)
315 .max()
316 .unwrap_or(1);
317 Self::new_with_broadcast_len(columns, broadcast_len)
318 }
319
320 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
321 /// columns to broadcast_len.
322 pub fn new_with_broadcast_len(
323 columns: Vec<Column>,
324 broadcast_len: usize,
325 ) -> PolarsResult<Self> {
326 ensure_names_unique(&columns, |s| s.name().as_str())?;
327 unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
328 }
329
330 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
331 /// columns to match the other columns.
332 ///
333 /// # Safety
334 /// Does not check that the column names are unique (which they must be).
335 pub unsafe fn new_with_broadcast_no_namecheck(
336 mut columns: Vec<Column>,
337 broadcast_len: usize,
338 ) -> PolarsResult<Self> {
339 for col in &mut columns {
340 // Length not equal to the broadcast len, needs broadcast or is an error.
341 let len = col.len();
342 if len != broadcast_len {
343 if len != 1 {
344 let name = col.name().to_owned();
345 let extra_info =
346 if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
347 format!(" (matching column '{}')", c.name())
348 } else {
349 String::new()
350 };
351 polars_bail!(
352 ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
353 );
354 }
355 *col = col.new_from_index(0, broadcast_len);
356 }
357 }
358
359 let length = if columns.is_empty() { 0 } else { broadcast_len };
360
361 Ok(unsafe { DataFrame::new_no_checks(length, columns) })
362 }
363
364 pub fn new_from_index(&self, index: usize, height: usize) -> Self {
365 let cols = self.columns.iter().map(|c| c.new_from_index(index, height));
366 unsafe { Self::new_no_checks(height, cols.collect()) }
367 }
368
369 /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
370 ///
371 /// # Example
372 ///
373 /// ```rust
374 /// use polars_core::prelude::DataFrame;
375 /// static EMPTY: DataFrame = DataFrame::empty();
376 /// ```
377 pub const fn empty() -> Self {
378 Self::empty_with_height(0)
379 }
380
381 /// Creates an empty `DataFrame` with a specific `height`.
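///
/// A minimal sketch: the frame reports rows but no columns.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = DataFrame::empty_with_height(5);
/// assert_eq!(df.shape(), (5, 0));
/// ```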
382 pub const fn empty_with_height(height: usize) -> Self {
383 DataFrame {
384 height,
385 columns: vec![],
386 cached_schema: OnceLock::new(),
387 }
388 }
389
390 /// Create an empty `DataFrame` with empty columns as per the `schema`.
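///
/// A minimal sketch, building the schema by hand:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let schema = Schema::from_iter(vec![Field::new("x".into(), DataType::Int64)]);
/// let df = DataFrame::empty_with_schema(&schema);
/// assert_eq!(df.shape(), (0, 1));
/// assert_eq!(df.dtypes(), &[DataType::Int64]);
/// ```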
391 pub fn empty_with_schema(schema: &Schema) -> Self {
392 let cols = schema
393 .iter()
394 .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
395 .collect();
396 unsafe { DataFrame::new_no_checks(0, cols) }
397 }
398
399 /// Create an empty `DataFrame` with empty columns as per the `schema`.
400 pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
401 let cols = schema
402 .iter_values()
403 .map(|fld| {
404 Column::from(Series::new_empty(
405 fld.name.clone(),
406 &(DataType::from_arrow_field(fld)),
407 ))
408 })
409 .collect();
410 unsafe { DataFrame::new_no_checks(0, cols) }
411 }
412
413 /// Create a new `DataFrame` with the given schema, only containing nulls.
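///
/// A minimal sketch, reusing the schema of an existing frame:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("x" => [1i32, 2, 3])?;
/// let nulls = DataFrame::full_null(df.schema(), 2);
/// assert_eq!(nulls.shape(), (2, 1));
/// # Ok::<(), PolarsError>(())
/// ```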
414 pub fn full_null(schema: &Schema, height: usize) -> Self {
415 let columns = schema
416 .iter_fields()
417 .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
418 .collect();
419 unsafe { DataFrame::new_no_checks(height, columns) }
420 }
421
422 /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
423 ///
424 /// # Example
425 ///
426 /// ```rust
427 /// # use polars_core::prelude::*;
428 /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
429 /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
430 /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
431 ///
432 /// assert_eq!(df.pop(), Some(s2));
433 /// assert_eq!(df.pop(), Some(s1));
434 /// assert_eq!(df.pop(), None);
435 /// assert!(df.is_empty());
436 /// # Ok::<(), PolarsError>(())
437 /// ```
438 pub fn pop(&mut self) -> Option<Column> {
439 self.clear_schema();
440
441 self.columns.pop()
442 }
443
444 /// Add a new column at index 0 that counts the rows.
445 ///
446 /// # Example
447 ///
448 /// ```
449 /// # use polars_core::prelude::*;
450 /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
451 /// assert_eq!(df1.shape(), (4, 1));
452 ///
453 /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
454 /// assert_eq!(df2.shape(), (4, 2));
455 /// println!("{}", df2);
456 ///
457 /// # Ok::<(), PolarsError>(())
458 /// ```
459 ///
460 /// Output:
461 ///
462 /// ```text
463 /// shape: (4, 2)
464 /// +-----+----------+
465 /// | Id | Name |
466 /// | --- | --- |
467 /// | u32 | str |
468 /// +=====+==========+
469 /// | 0 | James |
470 /// +-----+----------+
471 /// | 1 | Mary |
472 /// +-----+----------+
473 /// | 2 | John |
474 /// +-----+----------+
475 /// | 3 | Patricia |
476 /// +-----+----------+
477 /// ```
478 pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
479 let mut columns = Vec::with_capacity(self.columns.len() + 1);
480 let offset = offset.unwrap_or(0);
481
482 let col = Column::new_row_index(name, offset, self.height())?;
483 columns.push(col);
484 columns.extend_from_slice(&self.columns);
485 DataFrame::new(columns)
486 }
487
488 /// Add a row index column in place.
489 ///
490 /// # Safety
491 /// The caller should ensure the DataFrame does not already contain a column with the given name.
492 ///
493 /// # Panics
494 /// Panics if the resulting column would reach or overflow IdxSize::MAX.
495 pub unsafe fn with_row_index_mut(
496 &mut self,
497 name: PlSmallStr,
498 offset: Option<IdxSize>,
499 ) -> &mut Self {
501 debug_assert!(
502 self.columns.iter().all(|c| c.name() != &name),
503 "with_row_index_mut(): column with name {} already exists",
504 &name
505 );
506
507 let offset = offset.unwrap_or(0);
508 let col = Column::new_row_index(name, offset, self.height()).unwrap();
509
510 self.clear_schema();
511 self.columns.insert(0, col);
512 self
513 }
514
515 /// Create a new `DataFrame` without checking the lengths or for duplicate names of the
516 /// `Series`.
517 ///
518 /// Calculates the height from the first column or `0` if no columns are given.
519 ///
520 /// # Safety
521 ///
522 /// It is the caller's responsibility to uphold the contract of all `Series`
523 /// having an equal length and a unique name; if not, this may panic down the line.
524 pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
525 let height = columns.first().map_or(0, Column::len);
526 unsafe { Self::new_no_checks(height, columns) }
527 }
528
529 /// Create a new `DataFrame` without checking the lengths or for duplicate names of the
530 /// `Series`.
531 ///
532 /// It is advised to use [DataFrame::new] in favor of this method.
533 ///
534 /// # Safety
535 ///
536 /// It is the caller's responsibility to uphold the contract of all `Series`
537 /// having an equal length and a unique name; if not, this may panic down the line.
538 pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
539 if cfg!(debug_assertions) {
540 DataFrame::validate_columns_slice(&columns).unwrap();
541 }
542
543 unsafe { Self::_new_no_checks_impl(height, columns) }
544 }
545
546 /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
547 /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
548 /// constructed with this method is generally highly unsafe and should not be long-lived.
549 #[allow(clippy::missing_safety_doc)]
550 pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
551 DataFrame {
552 height,
553 columns,
554 cached_schema: OnceLock::new(),
555 }
556 }
557
558 /// Shrink the capacity of this DataFrame to fit its length.
559 pub fn shrink_to_fit(&mut self) {
560 // Don't parallelize this. Memory overhead
561 for s in &mut self.columns {
562 s.shrink_to_fit();
563 }
564 }
565
566 /// Aggregate all the chunks in the DataFrame to a single chunk.
567 pub fn as_single_chunk(&mut self) -> &mut Self {
568 // Don't parallelize this. Memory overhead
569 for s in &mut self.columns {
570 *s = s.rechunk();
571 }
572 self
573 }
574
575 /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
576 /// This may lead to more peak memory consumption.
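///
/// A minimal sketch: after a `vstack` the column holds multiple chunks; rechunking merges them.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("x" => [1i32, 2])?;
/// df.vstack_mut(&df!("x" => [3i32])?)?;
/// df.as_single_chunk_par();
/// assert_eq!(df.first_col_n_chunks(), 1);
/// # Ok::<(), PolarsError>(())
/// ```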
577 pub fn as_single_chunk_par(&mut self) -> &mut Self {
578 if self.columns.iter().any(|c| c.n_chunks() > 1) {
579 self.columns = self._apply_columns_par(&|s| s.rechunk());
580 }
581 self
582 }
583
584 /// Rechunks all columns to only have a single chunk.
585 pub fn rechunk_mut(&mut self) {
586 // SAFETY: We never adjust the length or names of the columns.
587 let columns = unsafe { self.get_columns_mut() };
588
589 for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
590 *col = col.rechunk();
591 }
592 }
593
594 pub fn _deshare_views_mut(&mut self) {
595 // SAFETY: We never adjust the length or names of the columns.
596 unsafe {
597 let columns = self.get_columns_mut();
598 for col in columns {
599 let Column::Series(s) = col else { continue };
600
601 if let Ok(ca) = s.binary() {
602 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
603 *col = Column::from(gc_ca.into_series());
604 } else if let Ok(ca) = s.str() {
605 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
606 *col = Column::from(gc_ca.into_series());
607 }
608 }
609 }
610 }
611
612 /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
613 pub fn rechunk_to_record_batch(
614 self,
615 compat_level: CompatLevel,
616 ) -> RecordBatchT<Box<dyn Array>> {
617 let height = self.height();
618
619 let (schema, arrays) = self
620 .columns
621 .into_iter()
622 .map(|col| {
623 let mut series = col.take_materialized_series();
624 // Rechunk to one chunk if necessary
625 if series.n_chunks() > 1 {
626 series = series.rechunk();
627 }
628 (
629 series.field().to_arrow(compat_level),
630 series.to_arrow(0, compat_level),
631 )
632 })
633 .collect();
634
635 RecordBatchT::new(height, Arc::new(schema), arrays)
636 }
637
638 /// Returns `true` if the chunks of the columns do not align and re-chunking should be done.
639 pub fn should_rechunk(&self) -> bool {
640 // Fast check. It is also needed for correctness, as code below doesn't check if the number
641 // of chunks is equal.
642 if !self
643 .get_columns()
644 .iter()
645 .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
646 .all_equal()
647 {
648 return true;
649 }
650
651 // From here we check chunk lengths.
652 let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
653 match chunk_lengths.next() {
654 None => false,
655 Some(first_column_chunk_lengths) => {
656 // Fast Path for single Chunk Series
657 if first_column_chunk_lengths.size_hint().0 == 1 {
658 return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
659 }
660 // Always rechunk if we have more chunks than rows.
661 // except when we have an empty df containing a single chunk
662 let height = self.height();
663 let n_chunks = first_column_chunk_lengths.size_hint().0;
664 if n_chunks > height && !(height == 0 && n_chunks == 1) {
665 return true;
666 }
667 // Slow Path for multi Chunk series
668 let v: Vec<_> = first_column_chunk_lengths.collect();
669 for cl in chunk_lengths {
670 if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
671 return true;
672 }
673 }
674 false
675 },
676 }
677 }
678
679 /// Ensure all the chunks in the [`DataFrame`] are aligned.
680 pub fn align_chunks_par(&mut self) -> &mut Self {
681 if self.should_rechunk() {
682 self.as_single_chunk_par()
683 } else {
684 self
685 }
686 }
687
688 pub fn align_chunks(&mut self) -> &mut Self {
689 if self.should_rechunk() {
690 self.as_single_chunk()
691 } else {
692 self
693 }
694 }
695
696 /// Get the [`DataFrame`] schema.
697 ///
698 /// # Example
699 ///
700 /// ```rust
701 /// # use polars_core::prelude::*;
702 /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
703 /// "Diameter (m)" => [8.8e26, f64::INFINITY])?;
704 ///
705 /// let f1: Field = Field::new("Thing".into(), DataType::String);
706 /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
707 /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
708 ///
709 /// assert_eq!(&**df.schema(), &sc);
710 /// # Ok::<(), PolarsError>(())
711 /// ```
712 pub fn schema(&self) -> &SchemaRef {
713 let out = self.cached_schema.get_or_init(|| {
714 Arc::new(
715 self.columns
716 .iter()
717 .map(|x| (x.name().clone(), x.dtype().clone()))
718 .collect(),
719 )
720 });
721
722 debug_assert_eq!(out.len(), self.width());
723
724 out
725 }
726
727 /// Get a reference to the [`DataFrame`] columns.
728 ///
729 /// # Example
730 ///
731 /// ```rust
732 /// # use polars_core::prelude::*;
733 /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
734 /// "Symbol" => ["A", "C", "G", "T"])?;
735 /// let columns: &[Column] = df.get_columns();
736 ///
737 /// assert_eq!(columns[0].name(), "Name");
738 /// assert_eq!(columns[1].name(), "Symbol");
739 /// # Ok::<(), PolarsError>(())
740 /// ```
741 #[inline]
742 pub fn get_columns(&self) -> &[Column] {
743 &self.columns
744 }
745
746 #[inline]
747 /// Get mutable access to the underlying columns.
748 ///
749 /// # Safety
750 ///
751 /// The caller must ensure the length of all [`Series`] remains equal to `height` or
752 /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
753 /// The caller must ensure that the cached schema is cleared if it modifies the schema by
754 /// calling [`DataFrame::clear_schema`].
755 pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
756 &mut self.columns
757 }
758
759 #[inline]
760 /// Remove all the columns in the [`DataFrame`] but keep the `height`.
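///
/// A minimal sketch: the width drops to zero while the height is preserved.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("x" => [1i32, 2, 3])?;
/// df.clear_columns();
/// assert_eq!(df.shape(), (3, 0));
/// # Ok::<(), PolarsError>(())
/// ```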
761 pub fn clear_columns(&mut self) {
762 unsafe { self.get_columns_mut() }.clear();
763 self.clear_schema();
764 }
765
766 #[inline]
767 /// Extend the columns without checking for name collisions or height.
768 ///
769 /// # Safety
770 ///
771 /// The caller needs to ensure that:
772 /// - Column names are unique within the resulting [`DataFrame`].
773 /// - The length of each appended column matches the height of the [`DataFrame`]. For
774 /// [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
775 /// with [`DataFrame::set_height`].
776 pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
777 unsafe { self.get_columns_mut() }.extend(iter);
778 self.clear_schema();
779 }
780
781 /// Take ownership of the underlying columns vec.
782 pub fn take_columns(self) -> Vec<Column> {
783 self.columns
784 }
785
786 /// Iterator over the columns as [`Series`].
787 ///
788 /// # Example
789 ///
790 /// ```rust
791 /// # use polars_core::prelude::*;
792 /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
793 /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
794 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
795 ///
796 /// let mut iterator = df.iter();
797 ///
798 /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
799 /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
800 /// assert_eq!(iterator.next(), None);
801 /// # Ok::<(), PolarsError>(())
802 /// ```
803 pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
804 self.materialized_column_iter()
805 }
806
807 /// # Example
808 ///
809 /// ```rust
810 /// # use polars_core::prelude::*;
811 /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
812 /// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
813 ///
814 /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
815 /// # Ok::<(), PolarsError>(())
816 /// ```
817 pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
818 self.columns.iter().map(|s| s.name()).collect()
819 }
820
821 /// Get the [`Vec<PlSmallStr>`] representing the column names.
822 pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
823 self.columns.iter().map(|s| s.name().clone()).collect()
824 }
825
826 pub fn get_column_names_str(&self) -> Vec<&str> {
827 self.columns.iter().map(|s| s.name().as_str()).collect()
828 }
829
830 /// Set the column names.
831 /// # Example
832 ///
833 /// ```rust
834 /// # use polars_core::prelude::*;
835 /// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;
836 /// df.set_column_names(["Set"])?;
837 ///
838 /// assert_eq!(df.get_column_names(), &["Set"]);
839 /// # Ok::<(), PolarsError>(())
840 /// ```
841 pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
842 where
843 I: IntoIterator<Item = S>,
844 S: Into<PlSmallStr>,
845 {
846 let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
847 self._set_column_names_impl(names.as_slice())
848 }
849
850 fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
851 polars_ensure!(
852 names.len() == self.width(),
853 ShapeMismatch: "{} column names provided for a DataFrame of width {}",
854 names.len(), self.width()
855 );
856 ensure_names_unique(names, |s| s.as_str())?;
857
858 let columns = mem::take(&mut self.columns);
859 self.columns = columns
860 .into_iter()
861 .zip(names)
862 .map(|(s, name)| {
863 let mut s = s;
864 s.rename(name.clone());
865 s
866 })
867 .collect();
868 self.clear_schema();
869 Ok(())
870 }
871
872 /// Get the data types of the columns in the [`DataFrame`].
873 ///
874 /// # Example
875 ///
876 /// ```rust
877 /// # use polars_core::prelude::*;
878 /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
879 /// "Fraction" => [0.965, 0.035])?;
880 ///
881 /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
882 /// # Ok::<(), PolarsError>(())
883 /// ```
884 pub fn dtypes(&self) -> Vec<DataType> {
885 self.columns.iter().map(|s| s.dtype().clone()).collect()
886 }
887
888 pub(crate) fn first_series_column(&self) -> Option<&Series> {
889 self.columns.iter().find_map(|col| col.as_series())
890 }
891
892 /// The number of chunks for the first column.
893 pub fn first_col_n_chunks(&self) -> usize {
894 match self.first_series_column() {
895 None if self.columns.is_empty() => 0,
896 None => 1,
897 Some(s) => s.n_chunks(),
898 }
899 }
900
901 /// The highest number of chunks for any column.
902 pub fn max_n_chunks(&self) -> usize {
903 self.columns
904 .iter()
905 .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
906 .max()
907 .unwrap_or(0)
908 }
909
910 /// Get a reference to the schema fields of the [`DataFrame`].
911 ///
912 /// # Example
913 ///
914 /// ```rust
915 /// # use polars_core::prelude::*;
916 /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
917 /// "Fraction" => [0.708, 0.292])?;
918 ///
919 /// let f1: Field = Field::new("Surface type".into(), DataType::String);
920 /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
921 ///
922 /// assert_eq!(earth.fields(), &[f1, f2]);
923 /// # Ok::<(), PolarsError>(())
924 /// ```
925 pub fn fields(&self) -> Vec<Field> {
926 self.columns
927 .iter()
928 .map(|s| s.field().into_owned())
929 .collect()
930 }
931
932 /// Get (height, width) of the [`DataFrame`].
933 ///
934 /// # Example
935 ///
936 /// ```rust
937 /// # use polars_core::prelude::*;
938 /// let df0: DataFrame = DataFrame::default();
939 /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
940 /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
941 /// "2" => [1, 2, 3, 4, 5])?;
942 ///
943 /// assert_eq!(df0.shape(), (0 ,0));
944 /// assert_eq!(df1.shape(), (5, 1));
945 /// assert_eq!(df2.shape(), (5, 2));
946 /// # Ok::<(), PolarsError>(())
947 /// ```
948 pub fn shape(&self) -> (usize, usize) {
949 (self.height, self.columns.len())
950 }
951
952 /// Get the width of the [`DataFrame`] which is the number of columns.
953 ///
954 /// # Example
955 ///
956 /// ```rust
957 /// # use polars_core::prelude::*;
958 /// let df0: DataFrame = DataFrame::default();
959 /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
960 /// let df2: DataFrame = df!("Series 1" => [0; 0],
961 /// "Series 2" => [0; 0])?;
962 ///
963 /// assert_eq!(df0.width(), 0);
964 /// assert_eq!(df1.width(), 1);
965 /// assert_eq!(df2.width(), 2);
966 /// # Ok::<(), PolarsError>(())
967 /// ```
968 pub fn width(&self) -> usize {
969 self.columns.len()
970 }
971
972 /// Get the height of the [`DataFrame`] which is the number of rows.
973 ///
974 /// # Example
975 ///
976 /// ```rust
977 /// # use polars_core::prelude::*;
978 /// let df0: DataFrame = DataFrame::default();
979 /// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
980 /// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
981 ///
982 /// assert_eq!(df0.height(), 0);
983 /// assert_eq!(df1.height(), 2);
984 /// assert_eq!(df2.height(), 5);
985 /// # Ok::<(), PolarsError>(())
986 /// ```
987 pub fn height(&self) -> usize {
988 self.height
989 }
990
991 /// Returns the size as number of rows * number of columns
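///
/// A minimal sketch:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("x" => [1i32, 2, 3], "y" => [4i32, 5, 6])?;
/// assert_eq!(df.size(), 6);
/// # Ok::<(), PolarsError>(())
/// ```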
992 pub fn size(&self) -> usize {
993 let s = self.shape();
994 s.0 * s.1
995 }
996
997 /// Returns `true` if the [`DataFrame`] contains no rows.
998 ///
999 /// # Example
1000 ///
1001 /// ```rust
1002 /// # use polars_core::prelude::*;
1003 /// let df1: DataFrame = DataFrame::default();
1004 /// assert!(df1.is_empty());
1005 ///
1006 /// let df2: DataFrame = df!("First name" => ["Forever"],
1007 /// "Last name" => ["Alone"])?;
1008 /// assert!(!df2.is_empty());
1009 /// # Ok::<(), PolarsError>(())
1010 /// ```
1011 pub fn is_empty(&self) -> bool {
1012 matches!(self.shape(), (0, _) | (_, 0))
1013 }
1014
1015 /// Set the height (i.e. number of rows) of this [`DataFrame`].
1016 ///
1017 /// # Safety
1018 ///
1019 /// This needs to be equal to the length of all the columns.
1020 pub unsafe fn set_height(&mut self, height: usize) {
1021 self.height = height;
1022 }
1023
1024 /// Add multiple [`Series`] to a [`DataFrame`].
1025 /// The added columns are required to have the same length as the existing columns.
1026 ///
1027 /// # Example
1028 ///
1029 /// ```rust
1030 /// # use polars_core::prelude::*;
1031 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1032 /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1033 /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1034 ///
1035 /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1036 /// assert_eq!(df2.shape(), (3, 3));
1037 /// println!("{}", df2);
1038 /// # Ok::<(), PolarsError>(())
1039 /// ```
1040 ///
1041 /// Output:
1042 ///
1043 /// ```text
1044 /// shape: (3, 3)
1045 /// +---------+--------+----------+
1046 /// | Element | Proton | Electron |
1047 /// | --- | --- | --- |
1048 /// | str | i32 | i32 |
1049 /// +=========+========+==========+
1050 /// | Copper | 29 | 29 |
1051 /// +---------+--------+----------+
1052 /// | Silver | 47 | 47 |
1053 /// +---------+--------+----------+
1054 /// | Gold | 79 | 79 |
1055 /// +---------+--------+----------+
1056 /// ```
1057 pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1058 let mut new_cols = self.columns.clone();
1059 new_cols.extend_from_slice(columns);
1060 DataFrame::new(new_cols)
1061 }
1062
1063 /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1064 ///
1065 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1066 ///
1067 /// # Example
1068 ///
1069 /// ```rust
1070 /// # use polars_core::prelude::*;
1071 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1072 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1073 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1074 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1075 ///
1076 /// let df3: DataFrame = df1.vstack(&df2)?;
1077 ///
1078 /// assert_eq!(df3.shape(), (5, 2));
1079 /// println!("{}", df3);
1080 /// # Ok::<(), PolarsError>(())
1081 /// ```
1082 ///
1083 /// Output:
1084 ///
1085 /// ```text
1086 /// shape: (5, 2)
1087 /// +-----------+-------------------+
1088 /// | Element | Melting Point (K) |
1089 /// | --- | --- |
1090 /// | str | f64 |
1091 /// +===========+===================+
1092 /// | Copper | 1357.77 |
1093 /// +-----------+-------------------+
1094 /// | Silver | 1234.93 |
1095 /// +-----------+-------------------+
1096 /// | Gold | 1337.33 |
1097 /// +-----------+-------------------+
1098 /// | Platinum | 2041.4 |
1099 /// +-----------+-------------------+
1100 /// | Palladium | 1828.05 |
1101 /// +-----------+-------------------+
1102 /// ```
1103 pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1104 let mut df = self.clone();
1105 df.vstack_mut(other)?;
1106 Ok(df)
1107 }
1108
1109 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1110 ///
1111 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1112 ///
1113 /// # Example
1114 ///
1115 /// ```rust
1116 /// # use polars_core::prelude::*;
1117 /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1118 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1119 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1120 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1121 ///
1122 /// df1.vstack_mut(&df2)?;
1123 ///
1124 /// assert_eq!(df1.shape(), (5, 2));
1125 /// println!("{}", df1);
1126 /// # Ok::<(), PolarsError>(())
1127 /// ```
1128 ///
1129 /// Output:
1130 ///
1131 /// ```text
1132 /// shape: (5, 2)
1133 /// +-----------+-------------------+
1134 /// | Element | Melting Point (K) |
1135 /// | --- | --- |
1136 /// | str | f64 |
1137 /// +===========+===================+
1138 /// | Copper | 1357.77 |
1139 /// +-----------+-------------------+
1140 /// | Silver | 1234.93 |
1141 /// +-----------+-------------------+
1142 /// | Gold | 1337.33 |
1143 /// +-----------+-------------------+
1144 /// | Platinum | 2041.4 |
1145 /// +-----------+-------------------+
1146 /// | Palladium | 1828.05 |
1147 /// +-----------+-------------------+
1148 /// ```
1149 pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1150 if self.width() != other.width() {
1151 polars_ensure!(
1152 self.width() == 0,
1153 ShapeMismatch:
1154 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1155 self.width(), other.width(),
1156 );
1157 self.columns.clone_from(&other.columns);
1158 self.height = other.height;
1159 return Ok(self);
1160 }
1161
1162 self.columns
1163 .iter_mut()
1164 .zip(other.columns.iter())
1165 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1166 ensure_can_extend(&*left, right)?;
1167 left.append(right).map_err(|e| {
1168 e.context(format!("failed to vstack column '{}'", right.name()).into())
1169 })?;
1170 Ok(())
1171 })?;
1172 self.height += other.height;
1173 Ok(self)
1174 }
1175
1176 pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1177 if self.width() != other.width() {
1178 polars_ensure!(
1179 self.width() == 0,
1180 ShapeMismatch:
1181 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1182 self.width(), other.width(),
1183 );
1184 self.columns = other.columns;
1185 self.height = other.height;
1186 return Ok(self);
1187 }
1188
1189 self.columns
1190 .iter_mut()
1191 .zip(other.columns.into_iter())
1192 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1193 ensure_can_extend(&*left, &right)?;
1194 let right_name = right.name().clone();
1195 left.append_owned(right).map_err(|e| {
1196 e.context(format!("failed to vstack column '{right_name}'").into())
1197 })?;
1198 Ok(())
1199 })?;
1200 self.height += other.height;
1201 Ok(self)
1202 }
1203
1204 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1205 ///
1206 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1207 ///
1208 /// # Panics
1209 /// Panics if the schemas don't match.
1210 pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1211 self.columns
1212 .iter_mut()
1213 .zip(other.columns.iter())
1214 .for_each(|(left, right)| {
1215 left.append(right)
1216 .map_err(|e| {
1217 e.context(format!("failed to vstack column '{}'", right.name()).into())
1218 })
1219 .expect("should not fail");
1220 });
1221 self.height += other.height;
1222 }
1223
1224 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1225 ///
1226 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1227 ///
1228 /// # Panics
1229 /// Panics if the schemas don't match.
1230 pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1231 self.columns
1232 .iter_mut()
1233 .zip(other.columns)
1234 .for_each(|(left, right)| {
1235 left.append_owned(right).expect("should not fail");
1236 });
1237 self.height += other.height;
1238 }
1239
1240 /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1241 ///
1242 /// Different from [`vstack`](Self::vstack), which adds the chunks from `other` to the chunks of this [`DataFrame`],
1243 /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1244 ///
1245 /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1246 /// and thus will yield faster queries.
1247 ///
1248 /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1249 /// online operations where you add `n` rows and rerun a query.
1250 ///
1251 /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1252 /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
1253 /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
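///
/// # Example
///
/// A minimal sketch: the rows of `other` are appended to the existing buffers.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("x" => [1i32, 2])?;
/// df.extend(&df!("x" => [3i32, 4])?)?;
/// assert_eq!(df.height(), 4);
/// # Ok::<(), PolarsError>(())
/// ```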
1254 pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1255 polars_ensure!(
1256 self.width() == other.width(),
1257 ShapeMismatch:
1258 "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1259 self.width(), other.width(),
1260 );
1261
1262 self.columns
1263 .iter_mut()
1264 .zip(other.columns.iter())
1265 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1266 ensure_can_extend(&*left, right)?;
1267 left.extend(right).map_err(|e| {
1268 e.context(format!("failed to extend column '{}'", right.name()).into())
1269 })?;
1270 Ok(())
1271 })?;
1272 self.height += other.height;
1273 self.clear_schema();
1274 Ok(())
1275 }
1276
1277 /// Remove a column by name and return the column removed.
1278 ///
1279 /// # Example
1280 ///
1281 /// ```rust
1282 /// # use polars_core::prelude::*;
1283 /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1284 /// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1285 ///
1286 /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1287 /// assert!(s1.is_err());
1288 ///
1289 /// let s2: Column = df.drop_in_place("Animal")?;
1290 /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1291 /// # Ok::<(), PolarsError>(())
1292 /// ```
1293 pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1294 let idx = self.check_name_to_idx(name)?;
1295 self.clear_schema();
1296 Ok(self.columns.remove(idx))
1297 }
1298
1299 /// Return a new [`DataFrame`] where all null values are dropped.
1300 ///
1301 /// # Example
1302 ///
1303 /// ```no_run
1304 /// # use polars_core::prelude::*;
1305 /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1306 /// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1307 /// assert_eq!(df1.shape(), (3, 2));
1308 ///
1309 /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1310 /// assert_eq!(df2.shape(), (1, 2));
1311 /// println!("{}", df2);
1312 /// # Ok::<(), PolarsError>(())
1313 /// ```
1314 ///
1315 /// Output:
1316 ///
1317 /// ```text
1318 /// shape: (1, 2)
1319 /// +---------+---------------------+
1320 /// | Country | Tax revenue (% GDP) |
1321 /// | --- | --- |
1322 /// | str | f64 |
1323 /// +=========+=====================+
1324 /// | Malta | 32.7 |
1325 /// +---------+---------------------+
1326 /// ```
1327 pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1328 where
1329 for<'a> &'a S: Into<PlSmallStr>,
1330 {
1331 if let Some(v) = subset {
1332 let v = self.select_columns(v)?;
1333 self._drop_nulls_impl(v.as_slice())
1334 } else {
1335 self._drop_nulls_impl(self.columns.as_slice())
1336 }
1337 }
1338
1339 fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1340 // fast path for no nulls in df
1341 if subset.iter().all(|s| !s.has_nulls()) {
1342 return Ok(self.clone());
1343 }
1344
1345 let mut iter = subset.iter();
1346
1347 let mask = iter
1348 .next()
1349 .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1350 let mut mask = mask.is_not_null();
1351
1352 for c in iter {
1353 mask = mask & c.is_not_null();
1354 }
1355 self.filter(&mask)
1356 }
1357
1358 /// Drop a column by name.
1359 /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1360 /// the current one in place.
1361 ///
1362 /// # Example
1363 ///
1364 /// ```rust
1365 /// # use polars_core::prelude::*;
1366 /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1367 /// let df2: DataFrame = df1.drop("Ray type")?;
1368 ///
1369 /// assert!(df2.is_empty());
1370 /// # Ok::<(), PolarsError>(())
1371 /// ```
1372 pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1373 let idx = self.check_name_to_idx(name)?;
1374 let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1375
1376 self.columns.iter().enumerate().for_each(|(i, s)| {
1377 if i != idx {
1378 new_cols.push(s.clone())
1379 }
1380 });
1381
1382 Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1383 }
1384
1385 /// Drop columns that are in `names`.
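///
/// A minimal sketch (column names are illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("a" => [1i32], "b" => [2i32], "c" => [3i32])?;
/// let out = df.drop_many(["a", "c"]);
/// assert_eq!(out.get_column_names(), &["b"]);
/// # Ok::<(), PolarsError>(())
/// ```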
1386 pub fn drop_many<I, S>(&self, names: I) -> Self
1387 where
1388 I: IntoIterator<Item = S>,
1389 S: Into<PlSmallStr>,
1390 {
1391 let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1392 self.drop_many_amortized(&names)
1393 }
1394
1395 /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1396 pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1397 if names.is_empty() {
1398 return self.clone();
1399 }
1400 let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1401 self.columns.iter().for_each(|s| {
1402 if !names.contains(s.name()) {
1403 new_cols.push(s.clone())
1404 }
1405 });
1406
1407 unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1408 }
1409
1410 /// Insert a new column at a given index without checking for duplicates.
1411 /// This can leave the [`DataFrame`] in an invalid state.
1412 fn insert_column_no_name_check(
1413 &mut self,
1414 index: usize,
1415 column: Column,
1416 ) -> PolarsResult<&mut Self> {
1417 polars_ensure!(
1418 self.width() == 0 || column.len() == self.height(),
1419 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1420 column.len(), self.height(),
1421 );
1422
1423 if self.width() == 0 {
1424 self.height = column.len();
1425 }
1426
1427 self.columns.insert(index, column);
1428 self.clear_schema();
1429 Ok(self)
1430 }
1431
1432 /// Insert a new column at a given index.
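///
/// A minimal sketch (column names are illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1i32, 2], "c" => [5i32, 6])?;
/// df.insert_column(1, Column::new("b".into(), [3i32, 4]))?;
/// assert_eq!(df.get_column_names(), &["a", "b", "c"]);
/// # Ok::<(), PolarsError>(())
/// ```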
1433 pub fn insert_column<S: IntoColumn>(
1434 &mut self,
1435 index: usize,
1436 column: S,
1437 ) -> PolarsResult<&mut Self> {
1438 let column = column.into_column();
1439 self.check_already_present(column.name().as_str())?;
1440 self.insert_column_no_name_check(index, column)
1441 }
1442
1443 fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1444 if let Some(idx) = self.get_column_index(column.name().as_str()) {
1445 self.replace_column(idx, column)?;
1446 } else {
1447 if self.width() == 0 {
1448 self.height = column.len();
1449 }
1450
1451 self.columns.push(column);
1452 self.clear_schema();
1453 }
1454 Ok(())
1455 }
1456
1457 /// Add a new column to this [`DataFrame`] or replace an existing one.
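///
/// A minimal sketch; a length-1 column would instead be broadcast to the frame height.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1i32, 2, 3])?;
/// df.with_column(Column::new("b".into(), [4i32, 5, 6]))?;
/// assert_eq!(df.width(), 2);
/// # Ok::<(), PolarsError>(())
/// ```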
1458 pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1459 fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1460 let height = df.height();
1461 if column.len() == 1 && height > 1 {
1462 column = column.new_from_index(0, height);
1463 }
1464
1465 if column.len() == height || df.get_columns().is_empty() {
1466 df.add_column_by_search(column)?;
1467 Ok(df)
1468 }
1469 // special case for literals
1470 else if height == 0 && column.len() == 1 {
1471 let s = column.clear();
1472 df.add_column_by_search(s)?;
1473 Ok(df)
1474 } else {
1475 polars_bail!(
1476 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1477 column.len(), height,
1478 );
1479 }
1480 }
1481 let column = column.into_column();
1482 inner(self, column)
1483 }
1484
1485 /// Adds a column to the [`DataFrame`] without doing any checks
1486 /// on length or duplicates.
1487 ///
1488 /// # Safety
1489 /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1490 pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1491 debug_assert!(self.width() == 0 || self.height() == column.len());
1492 debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1493
1494 // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1495 // properly for `width` == 0.
1496 if self.width() == 0 {
1497 unsafe { self.set_height(column.len()) };
1498 }
1499 unsafe { self.get_columns_mut() }.push(column);
1500 self.clear_schema();
1501
1502 self
1503 }
1504
1505 // Note: Schema can be either the input or the output schema.
1506 fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1507 let name = c.name();
1508 if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1509 if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1510 // Given schema is output_schema and we can push.
1511 if idx == self.columns.len() {
1512 if self.width() == 0 {
1513 self.height = c.len();
1514 }
1515
1516 self.columns.push(c);
1517 self.clear_schema();
1518 }
1519 // Schema is incorrect; fall back to search.
1520 else {
1521 debug_assert!(false);
1522 self.add_column_by_search(c)?;
1523 }
1524 } else {
1525 self.replace_column(idx, c)?;
1526 }
1527 } else {
1528 if self.width() == 0 {
1529 self.height = c.len();
1530 }
1531
1532 self.columns.push(c);
1533 self.clear_schema();
1534 }
1535
1536 Ok(())
1537 }
1538
1539 // Note: Schema can be either the input or the output schema.
1540 pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1541 for (i, s) in series.into_iter().enumerate() {
1542 // we need to branch here
1543 // because users can add multiple columns with the same name
1544 if i == 0 || schema.get(s.name().as_str()).is_some() {
1545 self.with_column_and_schema(s.into_column(), schema)?;
1546 } else {
1547 self.with_column(s.clone().into_column())?;
1548 }
1549 }
1550 Ok(())
1551 }
1552
1553 pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1554 for (i, s) in columns.into_iter().enumerate() {
1555 // we need to branch here
1556 // because users can add multiple columns with the same name
1557 if i == 0 || schema.get(s.name().as_str()).is_some() {
1558 self.with_column_and_schema(s, schema)?;
1559 } else {
1560 self.with_column(s.clone())?;
1561 }
1562 }
1563
1564 Ok(())
1565 }
1566
1567 /// Add a new column to this [`DataFrame`] or replace an existing one.
1568 /// Uses an existing schema to amortize lookups.
1569 /// If the schema is incorrect, we will fall back to linear search.
1570 ///
1571 /// Note: Schema can be either the input or the output schema.
1572 pub fn with_column_and_schema<C: IntoColumn>(
1573 &mut self,
1574 column: C,
1575 schema: &Schema,
1576 ) -> PolarsResult<&mut Self> {
1577 let mut column = column.into_column();
1578
1579 let height = self.height();
1580 if column.len() == 1 && height > 1 {
1581 column = column.new_from_index(0, height);
1582 }
1583
1584 if column.len() == height || self.columns.is_empty() {
1585 self.add_column_by_schema(column, schema)?;
1586 Ok(self)
1587 }
1588 // special case for literals
1589 else if height == 0 && column.len() == 1 {
1590 let s = column.clear();
1591 self.add_column_by_schema(s, schema)?;
1592 Ok(self)
1593 } else {
1594 polars_bail!(
1595 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1596 column.len(), height,
1597 );
1598 }
1599 }
1600
1601 /// Get a row in the [`DataFrame`]. Beware this is slow.
1602 ///
1603 /// # Example
1604 ///
1605 /// ```
1606 /// # use polars_core::prelude::*;
1607 /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1608 /// df.get(idx)
1609 /// }
1610 /// ```
1611 pub fn get(&self, idx: usize) -> Option<Vec<AnyValue>> {
1612 match self.columns.first() {
1613 Some(s) => {
1614 if s.len() <= idx {
1615 return None;
1616 }
1617 },
1618 None => return None,
1619 }
1620 // SAFETY: we just checked bounds
1621 unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1622 }
1623
1624 /// Select a [`Series`] by index.
1625 ///
1626 /// # Example
1627 ///
1628 /// ```rust
1629 /// # use polars_core::prelude::*;
1630 /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1631 /// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1632 ///
1633 /// let s1: Option<&Column> = df.select_at_idx(0);
1634 /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1635 ///
1636 /// assert_eq!(s1, Some(&s2));
1637 /// # Ok::<(), PolarsError>(())
1638 /// ```
1639 pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1640 self.columns.get(idx)
1641 }
1642
1643 /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1644 ///
1645 /// # Examples
1646 ///
1647 /// ```rust
1648 /// # use polars_core::prelude::*;
1649 /// let df = df! {
1650 /// "0" => [0, 0, 0],
1651 /// "1" => [1, 1, 1],
1652 /// "2" => [2, 2, 2]
1653 /// }?;
1654 ///
1655 /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1656 /// assert!(df.equals(&df.select_by_range(..)?));
1657 /// # Ok::<(), PolarsError>(())
1658 /// ```
1659 pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1660 where
1661 R: ops::RangeBounds<usize>,
1662 {
1663 // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1664 // because it is a nightly feature. We should switch to it once it is stabilized.
1665 fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1666 where
1667 R: ops::RangeBounds<usize>,
1668 {
1669 let len = bounds.end;
1670
1671 let start: ops::Bound<&usize> = range.start_bound();
1672 let start = match start {
1673 ops::Bound::Included(&start) => start,
1674 ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1675 panic!("attempted to index slice from after maximum usize");
1676 }),
1677 ops::Bound::Unbounded => 0,
1678 };
1679
1680 let end: ops::Bound<&usize> = range.end_bound();
1681 let end = match end {
1682 ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1683 panic!("attempted to index slice up to maximum usize");
1684 }),
1685 ops::Bound::Excluded(&end) => end,
1686 ops::Bound::Unbounded => len,
1687 };
1688
1689 if start > end {
1690 panic!("slice index starts at {start} but ends at {end}");
1691 }
1692 if end > len {
1693 panic!("range end index {end} out of range for slice of length {len}",);
1694 }
1695
1696 ops::Range { start, end }
1697 }
1698
1699 let colnames = self.get_column_names_owned();
1700 let range = get_range(range, ..colnames.len());
1701
1702 self._select_impl(&colnames[range])
1703 }
1704
1705 /// Get column index of a [`Series`] by name.
1706 /// # Example
1707 ///
1708 /// ```rust
1709 /// # use polars_core::prelude::*;
1710 /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1711 /// "Health" => [100, 200, 500],
1712 /// "Mana" => [250, 100, 0],
1713 /// "Strength" => [30, 150, 300])?;
1714 ///
1715 /// assert_eq!(df.get_column_index("Name"), Some(0));
1716 /// assert_eq!(df.get_column_index("Health"), Some(1));
1717 /// assert_eq!(df.get_column_index("Mana"), Some(2));
1718 /// assert_eq!(df.get_column_index("Strength"), Some(3));
1719 /// assert_eq!(df.get_column_index("Haste"), None);
1720 /// # Ok::<(), PolarsError>(())
1721 /// ```
1722 pub fn get_column_index(&self, name: &str) -> Option<usize> {
1723 let schema = self.schema();
1724 if let Some(idx) = schema.index_of(name) {
1725 if self
1726 .get_columns()
1727 .get(idx)
1728 .is_some_and(|c| c.name() == name)
1729 {
1730 return Some(idx);
1731 }
1732 }
1733
1734 self.columns.iter().position(|s| s.name().as_str() == name)
1735 }
1736
1737 /// Get column index of a [`Series`] by name.
1738 pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1739 self.get_column_index(name)
1740 .ok_or_else(|| polars_err!(col_not_found = name))
1741 }
1742
1743 /// Select a single column by name.
1744 ///
1745 /// # Example
1746 ///
1747 /// ```rust
1748 /// # use polars_core::prelude::*;
1749 /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1750 /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1751 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1752 ///
1753 /// assert_eq!(df.column("Password")?, &s1);
1754 /// # Ok::<(), PolarsError>(())
1755 /// ```
1756 pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1757 let idx = self.try_get_column_index(name)?;
1758 Ok(self.select_at_idx(idx).unwrap())
1759 }
1760
1761 /// Select multiple columns by name.
1762 ///
1763 /// # Example
1764 ///
1765 /// ```rust
1766 /// # use polars_core::prelude::*;
1767 /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1768 /// "Max weight (kg)" => [16.0, 35.89])?;
1769 /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1770 ///
1771 /// assert_eq!(&df[0], sv[0]);
1772 /// assert_eq!(&df[1], sv[1]);
1773 /// # Ok::<(), PolarsError>(())
1774 /// ```
1775 pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1776 where
1777 I: IntoIterator<Item = S>,
1778 S: AsRef<str>,
1779 {
1780 names
1781 .into_iter()
1782 .map(|name| self.column(name.as_ref()))
1783 .collect()
1784 }
1785
1786 /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1787 ///
1788 /// # Examples
1789 ///
1790 /// ```
1791 /// # use polars_core::prelude::*;
1792 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1793 /// df.select(["foo", "bar"])
1794 /// }
1795 /// ```
1796 pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1797 where
1798 I: IntoIterator<Item = S>,
1799 S: Into<PlSmallStr>,
1800 {
1801 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1802 self._select_impl(cols.as_slice())
1803 }
1804
1805 pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1806 ensure_names_unique(cols, |s| s.as_str())?;
1807 self._select_impl_unchecked(cols)
1808 }
1809
1810 pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1811 let selected = self.select_columns_impl(cols)?;
1812 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1813 }
1814
1815 /// Select with a known schema. The schema names must match the column names of this DataFrame.
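///
/// # Example
///
/// A minimal sketch; the column names `"foo"` and `"bar"` are illustrative and the
/// passed `schema` is assumed to describe `df` itself:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame, schema: &SchemaRef) -> PolarsResult<DataFrame> {
///     df.select_with_schema(["foo", "bar"], schema)
/// }
/// ```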
1816 pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1817 where
1818 I: IntoIterator<Item = S>,
1819 S: Into<PlSmallStr>,
1820 {
1821 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1822 self._select_with_schema_impl(&cols, schema, true)
1823 }
1824
1825 /// Select with a known schema without checking for duplicates in `selection`.
1826 /// The schema names must match the column names of this DataFrame.
1827 pub fn select_with_schema_unchecked<I, S>(
1828 &self,
1829 selection: I,
1830 schema: &Schema,
1831 ) -> PolarsResult<Self>
1832 where
1833 I: IntoIterator<Item = S>,
1834 S: Into<PlSmallStr>,
1835 {
1836 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1837 self._select_with_schema_impl(&cols, schema, false)
1838 }
1839
/// The schema names must match the column names of this DataFrame.
1841 pub fn _select_with_schema_impl(
1842 &self,
1843 cols: &[PlSmallStr],
1844 schema: &Schema,
1845 check_duplicates: bool,
1846 ) -> PolarsResult<Self> {
1847 if check_duplicates {
1848 ensure_names_unique(cols, |s| s.as_str())?;
1849 }
1850
1851 let selected = self.select_columns_impl_with_schema(cols, schema)?;
1852 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1853 }
1854
/// A non-generic implementation to reduce compiler bloat.
1856 fn select_columns_impl_with_schema(
1857 &self,
1858 cols: &[PlSmallStr],
1859 schema: &Schema,
1860 ) -> PolarsResult<Vec<Column>> {
1861 if cfg!(debug_assertions) {
1862 ensure_matching_schema_names(schema, self.schema())?;
1863 }
1864
1865 cols.iter()
1866 .map(|name| {
1867 let index = schema.try_get_full(name.as_str())?.0;
1868 Ok(self.columns[index].clone())
1869 })
1870 .collect()
1871 }
1872
1873 pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1874 where
1875 I: IntoIterator<Item = S>,
1876 S: Into<PlSmallStr>,
1877 {
1878 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1879 self.select_physical_impl(&cols)
1880 }
1881
1882 fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1883 ensure_names_unique(cols, |s| s.as_str())?;
1884 let selected = self.select_columns_physical_impl(cols)?;
1885 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1886 }
1887
1888 /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1889 ///
1890 /// # Example
1891 ///
1892 /// ```rust
1893 /// # use polars_core::prelude::*;
1894 /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1895 /// "Carbon" => [1, 2, 3],
1896 /// "Hydrogen" => [4, 6, 8])?;
1897 /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1898 ///
1899 /// assert_eq!(df["Carbon"], sv[0]);
1900 /// assert_eq!(df["Hydrogen"], sv[1]);
1901 /// # Ok::<(), PolarsError>(())
1902 /// ```
1903 pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1904 let cols = selection.into_vec();
1905 self.select_columns_impl(&cols)
1906 }
1907
1908 fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1909 self.columns
1910 .iter()
1911 .enumerate()
1912 .map(|(i, s)| (s.name().as_str(), i))
1913 .collect()
1914 }
1915
/// A non-generic implementation to reduce compiler bloat.
1917 fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1918 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1919 let name_to_idx = self._names_to_idx_map();
1920 cols.iter()
1921 .map(|name| {
1922 let idx = *name_to_idx
1923 .get(name.as_str())
1924 .ok_or_else(|| polars_err!(col_not_found = name))?;
1925 Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1926 })
1927 .collect::<PolarsResult<Vec<_>>>()?
1928 } else {
1929 cols.iter()
1930 .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1931 .collect::<PolarsResult<Vec<_>>>()?
1932 };
1933
1934 Ok(selected)
1935 }
1936
/// A non-generic implementation to reduce compiler bloat.
1938 fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1939 let selected = if cols.len() > 1 && self.columns.len() > 10 {
// We hash because there are users that have millions of columns.
1941 // # https://github.com/pola-rs/polars/issues/1023
1942 let name_to_idx = self._names_to_idx_map();
1943
1944 cols.iter()
1945 .map(|name| {
1946 let idx = *name_to_idx
1947 .get(name.as_str())
1948 .ok_or_else(|| polars_err!(col_not_found = name))?;
1949 Ok(self.select_at_idx(idx).unwrap().clone())
1950 })
1951 .collect::<PolarsResult<Vec<_>>>()?
1952 } else {
1953 cols.iter()
1954 .map(|c| self.column(c.as_str()).cloned())
1955 .collect::<PolarsResult<Vec<_>>>()?
1956 };
1957
1958 Ok(selected)
1959 }
1960
1961 fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
// If there is a filtered column, just check how many rows are left.
1963 if let Some(fst) = filtered.first() {
1964 return fst.len();
1965 }
1966
// Otherwise, count the number of mask values that are true and derive the height from that.
1968 let num_trues = mask.num_trues();
1969 if mask.len() == self.height() {
1970 num_trues
1971 } else {
1972 // This is for broadcasting masks
1973 debug_assert!(num_trues == 0 || num_trues == 1);
1974 self.height() * num_trues
1975 }
1976 }
1977
1978 /// Take the [`DataFrame`] rows by a boolean mask.
1979 ///
1980 /// # Example
1981 ///
1982 /// ```
1983 /// # use polars_core::prelude::*;
1984 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1985 /// let mask = df.column("sepal_width")?.is_not_null();
1986 /// df.filter(&mask)
1987 /// }
1988 /// ```
1989 pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1990 let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
1991 let height = self.filter_height(&new_col, mask);
1992
1993 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1994 }
1995
1996 /// Same as `filter` but does not parallelize.
1997 pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1998 let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
1999 let height = self.filter_height(&new_col, mask);
2000
2001 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2002 }
2003
2004 /// Take [`DataFrame`] rows by index values.
2005 ///
2006 /// # Example
2007 ///
2008 /// ```
2009 /// # use polars_core::prelude::*;
2010 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2011 /// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2012 /// df.take(&idx)
2013 /// }
2014 /// ```
2015 pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2016 let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2017
2018 Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2019 }
2020
2021 /// # Safety
2022 /// The indices must be in-bounds.
2023 pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2024 self.take_unchecked_impl(idx, true)
2025 }
2026
2027 /// # Safety
2028 /// The indices must be in-bounds.
2029 pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
2030 let cols = if allow_threads {
2031 POOL.install(|| self._apply_columns_par(&|c| c.take_unchecked(idx)))
2032 } else {
2033 self._apply_columns(&|s| s.take_unchecked(idx))
2034 };
2035 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2036 }
2037
2038 /// # Safety
2039 /// The indices must be in-bounds.
2040 pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2041 self.take_slice_unchecked_impl(idx, true)
2042 }
2043
2044 /// # Safety
2045 /// The indices must be in-bounds.
2046 pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2047 let cols = if allow_threads {
2048 POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx)))
2049 } else {
2050 self._apply_columns(&|s| s.take_slice_unchecked(idx))
2051 };
2052 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2053 }
2054
2055 /// Rename a column in the [`DataFrame`].
2056 ///
2057 /// # Example
2058 ///
2059 /// ```
2060 /// # use polars_core::prelude::*;
2061 /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2062 /// let original_name = "foo";
2063 /// let new_name = "bar";
2064 /// df.rename(original_name, new_name.into())
2065 /// }
2066 /// ```
2067 pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2068 if column == name.as_str() {
2069 return Ok(self);
2070 }
2071 polars_ensure!(
2072 !self.schema().contains(&name),
2073 Duplicate: "column rename attempted with already existing name \"{name}\""
2074 );
2075
2076 self.get_column_index(column)
2077 .and_then(|idx| self.columns.get_mut(idx))
2078 .ok_or_else(|| polars_err!(col_not_found = column))
2079 .map(|c| c.rename(name))?;
2080 Ok(self)
2081 }
2082
2083 /// Sort [`DataFrame`] in place.
2084 ///
2085 /// See [`DataFrame::sort`] for more instruction.
2086 pub fn sort_in_place(
2087 &mut self,
2088 by: impl IntoVec<PlSmallStr>,
2089 sort_options: SortMultipleOptions,
2090 ) -> PolarsResult<&mut Self> {
2091 let by_column = self.select_columns(by)?;
2092 self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2093 Ok(self)
2094 }
2095
2096 #[doc(hidden)]
2097 /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
2098 pub fn sort_impl(
2099 &self,
2100 by_column: Vec<Column>,
2101 mut sort_options: SortMultipleOptions,
2102 slice: Option<(i64, usize)>,
2103 ) -> PolarsResult<Self> {
2104 if by_column.is_empty() {
2105 // If no columns selected, any order (including original order) is correct.
2106 return if let Some((offset, len)) = slice {
2107 Ok(self.slice(offset, len))
2108 } else {
2109 Ok(self.clone())
2110 };
2111 }
2112
// Note that the by_column argument may also contain evaluated expressions from
// polars-lazy that are not even present in this dataframe. Therefore, when we try
// to set the first column as sorted, we ignore the error, as those expressions are
// not present (they are renamed to _POLARS_SORT_COLUMN_i).
2117 let first_descending = sort_options.descending[0];
2118 let first_by_column = by_column[0].name().to_string();
2119
2120 let set_sorted = |df: &mut DataFrame| {
2121 // Mark the first sort column as sorted; if the column does not exist it
2122 // is ok, because we sorted by an expression not present in the dataframe
2123 let _ = df.apply(&first_by_column, |s| {
2124 let mut s = s.clone();
2125 if first_descending {
2126 s.set_sorted_flag(IsSorted::Descending)
2127 } else {
2128 s.set_sorted_flag(IsSorted::Ascending)
2129 }
2130 s
2131 });
2132 };
2133 if self.is_empty() {
2134 let mut out = self.clone();
2135 set_sorted(&mut out);
2136 return Ok(out);
2137 }
2138
2139 if let Some((0, k)) = slice {
2140 if k < self.len() {
2141 return self.bottom_k_impl(k, by_column, sort_options);
2142 }
2143 }
// Check if the required column is already sorted; if so we can exit early.
// We only do this when there is a single column to sort by; for multiple
// columns it would be complicated to do so.
2147 #[cfg(feature = "dtype-categorical")]
2148 let is_not_categorical_enum =
2149 !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2150 || matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2151
2152 #[cfg(not(feature = "dtype-categorical"))]
2153 #[allow(non_upper_case_globals)]
2154 const is_not_categorical_enum: bool = true;
2155
2156 if by_column.len() == 1 && is_not_categorical_enum {
2157 let required_sorting = if sort_options.descending[0] {
2158 IsSorted::Descending
2159 } else {
2160 IsSorted::Ascending
2161 };
// If the null count is 0 then nulls_last doesn't matter.
// It is safe to get the value at the last position since the dataframe is not empty (handled above).
2164 let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2165 && ((by_column[0].null_count() == 0)
2166 || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2167 == sort_options.nulls_last[0]);
2168
2169 if no_sorting_required {
2170 return if let Some((offset, len)) = slice {
2171 Ok(self.slice(offset, len))
2172 } else {
2173 Ok(self.clone())
2174 };
2175 }
2176 }
2177
2178 let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
2179
2180 // a lot of indirection in both sorting and take
2181 let mut df = self.clone();
2182 let df = df.as_single_chunk_par();
2183 let mut take = match (by_column.len(), has_nested) {
2184 (1, false) => {
2185 let s = &by_column[0];
2186 let options = SortOptions {
2187 descending: sort_options.descending[0],
2188 nulls_last: sort_options.nulls_last[0],
2189 multithreaded: sort_options.multithreaded,
2190 maintain_order: sort_options.maintain_order,
2191 limit: sort_options.limit,
2192 };
2193 // fast path for a frame with a single series
2194 // no need to compute the sort indices and then take by these indices
2195 // simply sort and return as frame
2196 if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2197 let mut out = s.sort_with(options)?;
2198 if let Some((offset, len)) = slice {
2199 out = out.slice(offset, len);
2200 }
2201 return Ok(out.into_frame());
2202 }
2203 s.arg_sort(options)
2204 },
2205 _ => {
2206 if sort_options.nulls_last.iter().all(|&x| x)
2207 || has_nested
2208 || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
2209 {
2210 argsort_multiple_row_fmt(
2211 &by_column,
2212 sort_options.descending,
2213 sort_options.nulls_last,
2214 sort_options.multithreaded,
2215 )?
2216 } else {
2217 let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
2218 first
2219 .as_materialized_series()
2220 .arg_sort_multiple(&other, &sort_options)?
2221 }
2222 },
2223 };
2224
2225 if let Some((offset, len)) = slice {
2226 take = take.slice(offset, len);
2227 }
2228
2229 // SAFETY:
2230 // the created indices are in bounds
2231 let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
2232 set_sorted(&mut df);
2233 Ok(df)
2234 }
2235
2236 /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2237 ///
2238 /// This dataframe does not necessarily have a specified schema and may be changed at any
2239 /// point. It is primarily used for debugging.
2240 pub fn _to_metadata(&self) -> DataFrame {
2241 let num_columns = self.columns.len();
2242
2243 let mut column_names =
2244 StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2245 let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2246 let mut sorted_asc_ca =
2247 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2248 let mut sorted_dsc_ca =
2249 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2250 let mut fast_explode_list_ca =
2251 BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2252 let mut materialized_at_ca =
2253 StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2254
2255 for col in &self.columns {
2256 let flags = col.get_flags();
2257
2258 let (repr, materialized_at) = match col {
2259 Column::Series(s) => ("series", s.materialized_at()),
2260 Column::Partitioned(_) => ("partitioned", None),
2261 Column::Scalar(_) => ("scalar", None),
2262 };
2263 let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2264 let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2265 let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2266
2267 column_names.append_value(col.name().clone());
2268 repr_ca.append_value(repr);
2269 sorted_asc_ca.append_value(sorted_asc);
2270 sorted_dsc_ca.append_value(sorted_dsc);
2271 fast_explode_list_ca.append_value(fast_explode_list);
2272 materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2273 }
2274
2275 unsafe {
2276 DataFrame::new_no_checks(
2277 self.width(),
2278 vec![
2279 column_names.finish().into_column(),
2280 repr_ca.finish().into_column(),
2281 sorted_asc_ca.finish().into_column(),
2282 sorted_dsc_ca.finish().into_column(),
2283 fast_explode_list_ca.finish().into_column(),
2284 materialized_at_ca.finish().into_column(),
2285 ],
2286 )
2287 }
2288 }
2289
2290 /// Return a sorted clone of this [`DataFrame`].
2291 ///
/// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
2293 /// # Example
2294 ///
2295 /// Sort by a single column with default options:
2296 /// ```
2297 /// # use polars_core::prelude::*;
2298 /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2299 /// df.sort(["sepal_width"], Default::default())
2300 /// }
2301 /// ```
2302 /// Sort by a single column with specific order:
2303 /// ```
2304 /// # use polars_core::prelude::*;
2305 /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2306 /// df.sort(
2307 /// ["sepal_width"],
2308 /// SortMultipleOptions::new()
2309 /// .with_order_descending(descending)
2310 /// )
2311 /// }
2312 /// ```
2313 /// Sort by multiple columns with specifying order for each column:
2314 /// ```
2315 /// # use polars_core::prelude::*;
2316 /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2317 /// df.sort(
2318 /// ["sepal_width", "sepal_length"],
2319 /// SortMultipleOptions::new()
2320 /// .with_order_descending_multi([false, true])
2321 /// )
2322 /// }
2323 /// ```
2324 /// See [`SortMultipleOptions`] for more options.
2325 ///
2326 /// Also see [`DataFrame::sort_in_place`].
2327 pub fn sort(
2328 &self,
2329 by: impl IntoVec<PlSmallStr>,
2330 sort_options: SortMultipleOptions,
2331 ) -> PolarsResult<Self> {
2332 let mut df = self.clone();
2333 df.sort_in_place(by, sort_options)?;
2334 Ok(df)
2335 }
2336
2337 /// Replace a column with a [`Series`].
2338 ///
2339 /// # Example
2340 ///
2341 /// ```rust
2342 /// # use polars_core::prelude::*;
2343 /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2344 /// "Area (km²)" => [9_833_520, 9_596_961])?;
2345 /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2346 ///
2347 /// assert!(df.replace("Nation", s.clone()).is_err());
2348 /// assert!(df.replace("Country", s).is_ok());
2349 /// # Ok::<(), PolarsError>(())
2350 /// ```
2351 pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2352 self.apply(column, |_| new_col.into_series())
2353 }
2354
2355 /// Replace or update a column. The difference between this method and [DataFrame::with_column]
/// is that now the value of `column: PlSmallStr` determines the name of the column and not the name
2357 /// of the `Series` passed to this method.
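///
/// # Example
///
/// A small sketch; the column names and values are illustrative:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2, 3])?;
///
/// // The new data ends up under the name passed as `column` ("b" here),
/// // not under the name of the Series itself ("ignored").
/// df.replace_or_add("b".into(), Series::new("ignored".into(), [4, 5, 6]))?;
/// assert_eq!(df.get_column_names(), &["a", "b"]);
/// # Ok::<(), PolarsError>(())
/// ```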
2358 pub fn replace_or_add<S: IntoSeries>(
2359 &mut self,
2360 column: PlSmallStr,
2361 new_col: S,
2362 ) -> PolarsResult<&mut Self> {
2363 let mut new_col = new_col.into_series();
2364 new_col.rename(column);
2365 self.with_column(new_col)
2366 }
2367
2368 /// Replace column at index `idx` with a [`Series`].
2369 ///
2370 /// # Example
2371 ///
/// ```ignore
2373 /// # use polars_core::prelude::*;
2374 /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2375 /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2376 /// let mut df = DataFrame::new(vec![s0, s1])?;
2377 ///
2378 /// // Add 32 to get lowercase ascii values
2379 /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2380 /// # Ok::<(), PolarsError>(())
2381 /// ```
2382 pub fn replace_column<C: IntoColumn>(
2383 &mut self,
2384 index: usize,
2385 new_column: C,
2386 ) -> PolarsResult<&mut Self> {
2387 polars_ensure!(
2388 index < self.width(),
2389 ShapeMismatch:
2390 "unable to replace at index {}, the DataFrame has only {} columns",
2391 index, self.width(),
2392 );
2393 let mut new_column = new_column.into_column();
2394 polars_ensure!(
2395 new_column.len() == self.height(),
2396 ShapeMismatch:
2397 "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2398 new_column.len(), self.height(),
2399 );
2400 let old_col = &mut self.columns[index];
2401 mem::swap(old_col, &mut new_column);
2402 self.clear_schema();
2403 Ok(self)
2404 }
2405
/// Apply a closure to a column. This is the recommended way to do in-place modification.
2407 ///
2408 /// # Example
2409 ///
2410 /// ```rust
2411 /// # use polars_core::prelude::*;
2412 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2413 /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2414 /// let mut df = DataFrame::new(vec![s0, s1])?;
2415 ///
2416 /// fn str_to_len(str_val: &Column) -> Column {
2417 /// str_val.str()
2418 /// .unwrap()
2419 /// .into_iter()
2420 /// .map(|opt_name: Option<&str>| {
2421 /// opt_name.map(|name: &str| name.len() as u32)
2422 /// })
2423 /// .collect::<UInt32Chunked>()
2424 /// .into_column()
2425 /// }
2426 ///
2427 /// // Replace the names column by the length of the names.
2428 /// df.apply("names", str_to_len);
2429 /// # Ok::<(), PolarsError>(())
2430 /// ```
2431 /// Results in:
2432 ///
2433 /// ```text
2434 /// +--------+-------+
/// | foo | names |
/// | --- | --- |
/// | str | u32 |
2438 /// +========+=======+
2439 /// | "ham" | 4 |
2440 /// +--------+-------+
2441 /// | "spam" | 6 |
2442 /// +--------+-------+
2443 /// | "egg" | 3 |
2444 /// +--------+-------+
2445 /// ```
2446 pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2447 where
2448 F: FnOnce(&Column) -> C,
2449 C: IntoColumn,
2450 {
2451 let idx = self.check_name_to_idx(name)?;
2452 self.apply_at_idx(idx, f)
2453 }
2454
/// Apply a closure to a column at index `idx`. This is the recommended way to do in-place
2456 /// modification.
2457 ///
2458 /// # Example
2459 ///
2460 /// ```rust
2461 /// # use polars_core::prelude::*;
2462 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2463 /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2464 /// let mut df = DataFrame::new(vec![s0, s1])?;
2465 ///
2466 /// // Add 32 to get lowercase ascii values
2467 /// df.apply_at_idx(1, |s| s + 32);
2468 /// # Ok::<(), PolarsError>(())
2469 /// ```
2470 /// Results in:
2471 ///
2472 /// ```text
2473 /// +--------+-------+
2474 /// | foo | ascii |
2475 /// | --- | --- |
2476 /// | str | i32 |
2477 /// +========+=======+
2478 /// | "ham" | 102 |
2479 /// +--------+-------+
2480 /// | "spam" | 111 |
2481 /// +--------+-------+
2482 /// | "egg" | 111 |
2483 /// +--------+-------+
2484 /// ```
2485 pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2486 where
2487 F: FnOnce(&Column) -> C,
2488 C: IntoColumn,
2489 {
2490 let df_height = self.height();
2491 let width = self.width();
2492 let col = self.columns.get_mut(idx).ok_or_else(|| {
2493 polars_err!(
2494 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2495 idx, width
2496 )
2497 })?;
2498 let name = col.name().clone();
2499 let new_col = f(col).into_column();
2500 match new_col.len() {
2501 1 => {
2502 let new_col = new_col.new_from_index(0, df_height);
2503 let _ = mem::replace(col, new_col);
2504 },
2505 len if (len == df_height) => {
2506 let _ = mem::replace(col, new_col);
2507 },
2508 len => polars_bail!(
2509 ShapeMismatch:
2510 "resulting Series has length {} while the DataFrame has height {}",
2511 len, df_height
2512 ),
2513 }
2514
2515 // make sure the name remains the same after applying the closure
2516 unsafe {
2517 let col = self.columns.get_unchecked_mut(idx);
2518 col.rename(name);
2519 }
2520 Ok(self)
2521 }
2522
/// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in-place
2524 /// modification.
2525 ///
2526 /// # Example
2527 ///
/// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
2529 ///
2530 /// ```rust
2531 /// # use polars_core::prelude::*;
2532 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2533 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2534 /// let mut df = DataFrame::new(vec![s0, s1])?;
2535 ///
2536 /// let idx = vec![0, 1, 4];
2537 ///
2538 /// df.try_apply("foo", |c| {
2539 /// c.str()?
2540 /// .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2541 /// });
2542 /// # Ok::<(), PolarsError>(())
2543 /// ```
2544 /// Results in:
2545 ///
2546 /// ```text
2547 /// +---------------------+--------+
2548 /// | foo | values |
2549 /// | --- | --- |
2550 /// | str | i32 |
2551 /// +=====================+========+
2552 /// | "ham-is-modified" | 1 |
2553 /// +---------------------+--------+
2554 /// | "spam-is-modified" | 2 |
2555 /// +---------------------+--------+
2556 /// | "egg" | 3 |
2557 /// +---------------------+--------+
2558 /// | "bacon" | 4 |
2559 /// +---------------------+--------+
2560 /// | "quack-is-modified" | 5 |
2561 /// +---------------------+--------+
2562 /// ```
2563 pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2564 where
2565 F: FnOnce(&Column) -> PolarsResult<C>,
2566 C: IntoColumn,
2567 {
2568 let width = self.width();
2569 let col = self.columns.get_mut(idx).ok_or_else(|| {
2570 polars_err!(
2571 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2572 idx, width
2573 )
2574 })?;
2575 let name = col.name().clone();
2576
2577 let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2578
2579 // make sure the name remains the same after applying the closure
2580 unsafe {
2581 let col = self.columns.get_unchecked_mut(idx);
2582 col.rename(name);
2583 }
2584 Ok(self)
2585 }
2586
/// Apply a closure that may fail to a column. This is the recommended way to do in-place
2588 /// modification.
2589 ///
2590 /// # Example
2591 ///
/// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2593 ///
2594 /// ```rust
2595 /// # use polars_core::prelude::*;
2596 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2597 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2598 /// let mut df = DataFrame::new(vec![s0, s1])?;
2599 ///
2600 /// // create a mask
2601 /// let values = df.column("values")?.as_materialized_series();
2602 /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2603 ///
2604 /// df.try_apply("foo", |c| {
2605 /// c.str()?
2606 /// .set(&mask, Some("not_within_bounds"))
2607 /// });
2608 /// # Ok::<(), PolarsError>(())
2609 /// ```
2610 /// Results in:
2611 ///
2612 /// ```text
2613 /// +---------------------+--------+
2614 /// | foo | values |
2615 /// | --- | --- |
2616 /// | str | i32 |
2617 /// +=====================+========+
2618 /// | "not_within_bounds" | 1 |
2619 /// +---------------------+--------+
2620 /// | "spam" | 2 |
2621 /// +---------------------+--------+
2622 /// | "egg" | 3 |
2623 /// +---------------------+--------+
2624 /// | "bacon" | 4 |
2625 /// +---------------------+--------+
2626 /// | "not_within_bounds" | 5 |
2627 /// +---------------------+--------+
2628 /// ```
2629 pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2630 where
2631 F: FnOnce(&Series) -> PolarsResult<C>,
2632 C: IntoColumn,
2633 {
2634 let idx = self.try_get_column_index(column)?;
2635 self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2636 }
2637
2638 /// Slice the [`DataFrame`] along the rows.
2639 ///
2640 /// # Example
2641 ///
2642 /// ```rust
2643 /// # use polars_core::prelude::*;
2644 /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2645 /// "Color" => ["Green", "Red", "White", "White", "Red"])?;
2646 /// let sl: DataFrame = df.slice(2, 3);
2647 ///
2648 /// assert_eq!(sl.shape(), (3, 2));
2649 /// println!("{}", sl);
2650 /// # Ok::<(), PolarsError>(())
2651 /// ```
2652 /// Output:
2653 /// ```text
2654 /// shape: (3, 2)
2655 /// +-------+-------+
2656 /// | Fruit | Color |
2657 /// | --- | --- |
2658 /// | str | str |
2659 /// +=======+=======+
2660 /// | Grape | White |
2661 /// +-------+-------+
2662 /// | Fig | White |
2663 /// +-------+-------+
2664 /// | Fig | Red |
2665 /// +-------+-------+
2666 /// ```
2667 #[must_use]
2668 pub fn slice(&self, offset: i64, length: usize) -> Self {
2669 if offset == 0 && length == self.height() {
2670 return self.clone();
2671 }
2672 if length == 0 {
2673 return self.clear();
2674 }
2675 let col = self
2676 .columns
2677 .iter()
2678 .map(|s| s.slice(offset, length))
2679 .collect::<Vec<_>>();
2680
2681 let height = if let Some(fst) = col.first() {
2682 fst.len()
2683 } else {
2684 let (_, length) = slice_offsets(offset, length, self.height());
2685 length
2686 };
2687
2688 unsafe { DataFrame::new_no_checks(height, col) }
2689 }
2690
2691 /// Split [`DataFrame`] at the given `offset`.
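///
/// # Example
///
/// A minimal sketch with an illustrative frame:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("x" => [1, 2, 3, 4])?;
/// let (head, tail) = df.split_at(2);
///
/// assert_eq!(head.height(), 2);
/// assert_eq!(tail.height(), 2);
/// # Ok::<(), PolarsError>(())
/// ```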
2692 pub fn split_at(&self, offset: i64) -> (Self, Self) {
2693 let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2694
2695 let (idx, _) = slice_offsets(offset, 0, self.height());
2696
2697 let a = unsafe { DataFrame::new_no_checks(idx, a) };
2698 let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2699 (a, b)
2700 }
2701
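/// Return a new [`DataFrame`] with zero rows but the same column names and dtypes.
///
/// A minimal sketch:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("x" => [1, 2, 3])?;
/// let empty = df.clear();
///
/// assert_eq!(empty.height(), 0);
/// assert_eq!(empty.schema(), df.schema());
/// # Ok::<(), PolarsError>(())
/// ```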
2702 pub fn clear(&self) -> Self {
2703 let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2704 unsafe { DataFrame::new_no_checks(0, col) }
2705 }
2706
2707 #[must_use]
2708 pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2709 if offset == 0 && length == self.height() {
2710 return self.clone();
2711 }
2712 let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2713 unsafe { DataFrame::new_no_checks(length, columns) }
2714 }
2715
2716 #[must_use]
2717 pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2718 if offset == 0 && length == self.height() {
2719 return self.clone();
2720 }
2721 // @scalar-opt
2722 let columns = self._apply_columns(&|s| {
2723 let mut out = s.slice(offset, length);
2724 out.shrink_to_fit();
2725 out
2726 });
2727 unsafe { DataFrame::new_no_checks(length, columns) }
2728 }
2729
2730 /// Get the head of the [`DataFrame`].
2731 ///
2732 /// # Example
2733 ///
2734 /// ```rust
2735 /// # use polars_core::prelude::*;
2736 /// let countries: DataFrame =
2737 /// df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2738 /// "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2739 /// "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2740 /// "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2741 /// assert_eq!(countries.shape(), (5, 4));
2742 ///
2743 /// println!("{}", countries.head(Some(3)));
2744 /// # Ok::<(), PolarsError>(())
2745 /// ```
2746 ///
2747 /// Output:
2748 ///
2749 /// ```text
2750 /// shape: (3, 4)
2751 /// +--------------------+---------------+---------------+------------+
2752 /// | Rank by GDP (2021) | Continent | Country | Capital |
2753 /// | --- | --- | --- | --- |
2754 /// | i32 | str | str | str |
2755 /// +====================+===============+===============+============+
2756 /// | 1 | North America | United States | Washington |
2757 /// +--------------------+---------------+---------------+------------+
2758 /// | 2 | Asia | China | Beijing |
2759 /// +--------------------+---------------+---------------+------------+
2760 /// | 3 | Asia | Japan | Tokyo |
2761 /// +--------------------+---------------+---------------+------------+
2762 /// ```
2763 #[must_use]
2764 pub fn head(&self, length: Option<usize>) -> Self {
2765 let col = self
2766 .columns
2767 .iter()
2768 .map(|c| c.head(length))
2769 .collect::<Vec<_>>();
2770
2771 let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2772 let height = usize::min(height, self.height());
2773 unsafe { DataFrame::new_no_checks(height, col) }
2774 }
2775
2776 /// Get the tail of the [`DataFrame`].
2777 ///
2778 /// # Example
2779 ///
2780 /// ```rust
2781 /// # use polars_core::prelude::*;
2782 /// let countries: DataFrame =
2783 /// df!("Rank (2021)" => [105, 106, 107, 108, 109],
2784 /// "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2785 /// "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2786 /// assert_eq!(countries.shape(), (5, 3));
2787 ///
2788 /// println!("{}", countries.tail(Some(2)));
2789 /// # Ok::<(), PolarsError>(())
2790 /// ```
2791 ///
2792 /// Output:
2793 ///
2794 /// ```text
2795 /// shape: (2, 3)
2796 /// +-------------+--------------------+---------+
2797 /// | Rank (2021) | Apple Price (€/kg) | Country |
2798 /// | --- | --- | --- |
2799 /// | i32 | f64 | str |
2800 /// +=============+====================+=========+
/// | 108 | 0.65 | Syria |
/// +-------------+--------------------+---------+
/// | 109 | 0.52 | Turkey |
2804 /// +-------------+--------------------+---------+
2805 /// ```
2806 #[must_use]
2807 pub fn tail(&self, length: Option<usize>) -> Self {
2808 let col = self
2809 .columns
2810 .iter()
2811 .map(|c| c.tail(length))
2812 .collect::<Vec<_>>();
2813
2814 let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2815 let height = usize::min(height, self.height());
2816 unsafe { DataFrame::new_no_checks(height, col) }
2817 }
2818
/// Iterator over the chunks in this [`DataFrame`] as Arrow RecordBatches.
2820 ///
2821 /// # Panics
2822 ///
2823 /// Panics if the [`DataFrame`] that is passed is not rechunked.
2824 ///
2825 /// This responsibility is left to the caller as we don't want to take mutable references here,
2826 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2827 /// as well.
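///
/// # Example
///
/// A minimal sketch; each item is one Arrow `RecordBatch` covering a single chunk:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame) {
///     for record_batch in df.iter_chunks(CompatLevel::newest(), false) {
///         // Hand the batch to an Arrow consumer, write it to IPC, etc.
///         let _ = record_batch;
///     }
/// }
/// ```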
2828 pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter {
2829 debug_assert!(!self.should_rechunk(), "expected equal chunks");
// If any of the columns is a binview (String/Binary) and we must convert for the given
// `compat_level`, we allow parallelism, as we have to allocate new arrow strings/binaries.
2832 let must_convert = compat_level.0 == 0;
2833 let parallel = parallel
2834 && must_convert
2835 && self.columns.len() > 1
2836 && self
2837 .columns
2838 .iter()
2839 .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2840
2841 RecordBatchIter {
2842 columns: &self.columns,
2843 schema: Arc::new(
2844 self.columns
2845 .iter()
2846 .map(|c| c.field().to_arrow(compat_level))
2847 .collect(),
2848 ),
2849 idx: 0,
2850 n_chunks: self.first_col_n_chunks(),
2851 compat_level,
2852 parallel,
2853 }
2854 }
2855
/// Iterator over the chunks in this [`DataFrame`] as Arrow RecordBatches of physical values.
2857 ///
2858 /// # Panics
2859 ///
2860 /// Panics if the [`DataFrame`] that is passed is not rechunked.
2861 ///
2862 /// This responsibility is left to the caller as we don't want to take mutable references here,
2863 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2864 /// as well.
2865 pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2866 debug_assert!(!self.should_rechunk());
2867 PhysRecordBatchIter {
2868 schema: Arc::new(
2869 self.get_columns()
2870 .iter()
2871 .map(|c| c.field().to_arrow(CompatLevel::newest()))
2872 .collect(),
2873 ),
2874 arr_iters: self
2875 .materialized_column_iter()
2876 .map(|s| s.chunks().iter())
2877 .collect(),
2878 }
2879 }
2880
/// Get a [`DataFrame`] with the rows in reversed order.
2882 #[must_use]
2883 pub fn reverse(&self) -> Self {
2884 let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2885 unsafe { DataFrame::new_no_checks(self.height(), col) }
2886 }
2887
/// Shift the values by a given period and fill the parts that will be empty due to this operation
/// with `None` values.
2890 ///
2891 /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
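///
/// # Example
///
/// A minimal sketch with an illustrative column:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [1, 2, 3])?;
/// let shifted = df.shift(1);
///
/// // The height is unchanged; one null is shifted in at the start of "a".
/// assert_eq!(shifted.height(), 3);
/// assert_eq!(shifted.column("a")?.null_count(), 1);
/// # Ok::<(), PolarsError>(())
/// ```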
2892 #[must_use]
2893 pub fn shift(&self, periods: i64) -> Self {
2894 let col = self._apply_columns_par(&|s| s.shift(periods));
2895 unsafe { DataFrame::new_no_checks(self.height(), col) }
2896 }
2897
2898 /// Replace None values with one of the following strategies:
2899 /// * Forward fill (replace None with the previous value)
2900 /// * Backward fill (replace None with the next value)
2901 /// * Mean fill (replace None with the mean of the whole array)
2902 /// * Min fill (replace None with the minimum of the whole array)
2903 /// * Max fill (replace None with the maximum of the whole array)
2904 ///
2905 /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
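///
/// # Example
///
/// A minimal sketch using the mean strategy; any [`FillNullStrategy`] variant works the same way:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
///     df.fill_null(FillNullStrategy::Mean)
/// }
/// ```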
2906 pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2907 let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
2908
2909 Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
2910 }
2911
/// Pipe different functions/closure operations that work on a DataFrame together.
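///
/// # Example
///
/// A minimal sketch; the piped closure here simply returns the height:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: DataFrame) -> PolarsResult<usize> {
///     // Any `Fn(DataFrame) -> PolarsResult<B>` can be piped.
///     df.pipe(|df| Ok(df.height()))
/// }
/// ```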
2913 pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
2914 where
2915 F: Fn(DataFrame) -> PolarsResult<B>,
2916 {
2917 f(self)
2918 }
2919
/// Pipe different functions/closure operations that work on a DataFrame together.
2921 pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
2922 where
2923 F: Fn(&mut DataFrame) -> PolarsResult<B>,
2924 {
2925 f(self)
2926 }
2927
/// Pipe different functions/closure operations that work on a DataFrame together.
2929 pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
2930 where
2931 F: Fn(DataFrame, Args) -> PolarsResult<B>,
2932 {
2933 f(self, args)
2934 }
2935
2936 /// Drop duplicate rows from a [`DataFrame`].
/// *This fails when there is a column of type List in the DataFrame.*
2938 ///
2939 /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
2940 ///
2941 /// # Example
2942 ///
2943 /// ```no_run
2944 /// # use polars_core::prelude::*;
2945 /// let df = df! {
2946 /// "flt" => [1., 1., 2., 2., 3., 3.],
2947 /// "int" => [1, 1, 2, 2, 3, 3, ],
2948 /// "str" => ["a", "a", "b", "b", "c", "c"]
2949 /// }?;
2950 ///
2951 /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
2952 /// # Ok::<(), PolarsError>(())
2953 /// ```
2954 /// Returns
2955 ///
2956 /// ```text
2957 /// +-----+-----+-----+
2958 /// | flt | int | str |
2959 /// | --- | --- | --- |
2960 /// | f64 | i32 | str |
2961 /// +=====+=====+=====+
2962 /// | 1 | 1 | "a" |
2963 /// +-----+-----+-----+
2964 /// | 2 | 2 | "b" |
2965 /// +-----+-----+-----+
2966 /// | 3 | 3 | "c" |
2967 /// +-----+-----+-----+
2968 /// ```
2969 #[cfg(feature = "algorithm_group_by")]
2970 pub fn unique_stable(
2971 &self,
2972 subset: Option<&[String]>,
2973 keep: UniqueKeepStrategy,
2974 slice: Option<(i64, usize)>,
2975 ) -> PolarsResult<DataFrame> {
2976 self.unique_impl(
2977 true,
2978 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2979 keep,
2980 slice,
2981 )
2982 }
2983
2984 /// Unstable distinct. See [`DataFrame::unique_stable`].
2985 #[cfg(feature = "algorithm_group_by")]
2986 pub fn unique<I, S>(
2987 &self,
2988 subset: Option<&[String]>,
2989 keep: UniqueKeepStrategy,
2990 slice: Option<(i64, usize)>,
2991 ) -> PolarsResult<DataFrame> {
2992 self.unique_impl(
2993 false,
2994 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2995 keep,
2996 slice,
2997 )
2998 }
2999
3000 #[cfg(feature = "algorithm_group_by")]
3001 pub fn unique_impl(
3002 &self,
3003 maintain_order: bool,
3004 subset: Option<Vec<PlSmallStr>>,
3005 keep: UniqueKeepStrategy,
3006 slice: Option<(i64, usize)>,
3007 ) -> PolarsResult<Self> {
3008 let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3009 let mut df = self.clone();
3010 // take on multiple chunks is terrible
3011 df.as_single_chunk_par();
3012
3013 let columns = match (keep, maintain_order) {
3014 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3015 let gb = df.group_by_stable(names)?;
3016 let groups = gb.get_groups();
3017 let (offset, len) = slice.unwrap_or((0, groups.len()));
3018 let groups = groups.slice(offset, len);
3019 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3020 },
3021 (UniqueKeepStrategy::Last, true) => {
3022 // maintain order by last values, so the sorted groups are not correct as they
3023 // are sorted by the first value
3024 let gb = df.group_by_stable(names)?;
3025 let groups = gb.get_groups();
3026
3027 let last_idx: NoNull<IdxCa> = groups
3028 .iter()
3029 .map(|g| match g {
3030 GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3031 GroupsIndicator::Slice([first, len]) => first + len - 1,
3032 })
3033 .collect();
3034
3035 let mut last_idx = last_idx.into_inner().sort(false);
3036
3037 if let Some((offset, len)) = slice {
3038 last_idx = last_idx.slice(offset, len);
3039 }
3040
3041 let last_idx = NoNull::new(last_idx);
3042 let out = unsafe { df.take_unchecked(&last_idx) };
3043 return Ok(out);
3044 },
3045 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3046 let gb = df.group_by(names)?;
3047 let groups = gb.get_groups();
3048 let (offset, len) = slice.unwrap_or((0, groups.len()));
3049 let groups = groups.slice(offset, len);
3050 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3051 },
3052 (UniqueKeepStrategy::Last, false) => {
3053 let gb = df.group_by(names)?;
3054 let groups = gb.get_groups();
3055 let (offset, len) = slice.unwrap_or((0, groups.len()));
3056 let groups = groups.slice(offset, len);
3057 df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3058 },
3059 (UniqueKeepStrategy::None, _) => {
3060 let df_part = df.select(names)?;
3061 let mask = df_part.is_unique()?;
3062 let mut filtered = df.filter(&mask)?;
3063
3064 if let Some((offset, len)) = slice {
3065 filtered = filtered.slice(offset, len);
3066 }
3067 return Ok(filtered);
3068 },
3069 };
3070 let height = Self::infer_height(&columns);
3071 Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3072 }
3073
3074 /// Get a mask of all the unique rows in the [`DataFrame`].
3075 ///
3076 /// # Example
3077 ///
3078 /// ```no_run
3079 /// # use polars_core::prelude::*;
3080 /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3081 /// "ISIN" => ["US0378331005", "US5949181045"])?;
3082 /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3083 ///
3084 /// assert!(ca.all());
3085 /// # Ok::<(), PolarsError>(())
3086 /// ```
3087 #[cfg(feature = "algorithm_group_by")]
3088 pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3089 let gb = self.group_by(self.get_column_names_owned())?;
3090 let groups = gb.get_groups();
3091 Ok(is_unique_helper(
3092 groups,
3093 self.height() as IdxSize,
3094 true,
3095 false,
3096 ))
3097 }
3098
3099 /// Get a mask of all the duplicated rows in the [`DataFrame`].
3100 ///
3101 /// # Example
3102 ///
3103 /// ```no_run
3104 /// # use polars_core::prelude::*;
3105 /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3106 /// "ISIN" => ["US02079K3059", "US02079K1079"])?;
3107 /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3108 ///
3109 /// assert!(!ca.all());
3110 /// # Ok::<(), PolarsError>(())
3111 /// ```
3112 #[cfg(feature = "algorithm_group_by")]
3113 pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3114 let gb = self.group_by(self.get_column_names_owned())?;
3115 let groups = gb.get_groups();
3116 Ok(is_unique_helper(
3117 groups,
3118 self.height() as IdxSize,
3119 false,
3120 true,
3121 ))
3122 }
3123
3124 /// Create a new [`DataFrame`] that shows the null counts per column.
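///
/// # Example
///
/// A minimal sketch; the result has a single row with one count per column:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [Some(1), None, Some(3)])?;
/// let counts = df.null_count();
///
/// assert_eq!(counts.shape(), (1, 1));
/// # Ok::<(), PolarsError>(())
/// ```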
3125 #[must_use]
3126 pub fn null_count(&self) -> Self {
3127 let cols = self
3128 .columns
3129 .iter()
3130 .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3131 .collect();
3132 unsafe { Self::new_no_checks(1, cols) }
3133 }
3134
3135 /// Hash and combine the row values
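///
/// # Example
///
/// A minimal sketch; passing `None` uses a default hasher:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &mut DataFrame) -> PolarsResult<UInt64Chunked> {
///     df.hash_rows(None)
/// }
/// ```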
3136 #[cfg(feature = "row_hash")]
3137 pub fn hash_rows(
3138 &mut self,
3139 hasher_builder: Option<PlSeedableRandomStateQuality>,
3140 ) -> PolarsResult<UInt64Chunked> {
3141 let dfs = split_df(self, POOL.current_num_threads(), false);
3142 let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3143
3144 let mut iter = cas.into_iter();
3145 let mut acc_ca = iter.next().unwrap();
3146 for ca in iter {
3147 acc_ca.append(&ca)?;
3148 }
3149 Ok(acc_ca.rechunk().into_owned())
3150 }
3151
3152 /// Get the supertype of the columns in this DataFrame
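///
/// # Example
///
/// A small sketch; the promotion rules are those of `try_get_supertype`:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("ints" => [1i32, 2], "floats" => [1.0f64, 2.0])?;
///
/// assert_eq!(df.get_supertype().unwrap()?, DataType::Float64);
/// # Ok::<(), PolarsError>(())
/// ```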
3153 pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3154 self.columns
3155 .iter()
3156 .map(|s| Ok(s.dtype().clone()))
3157 .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3158 }
3159
3160 /// Take by index values given by the slice `idx`.
3161 /// # Warning
/// Be careful with allowing threads when calling this in a large hot loop;
/// every thread split may be on the rayon stack and lead to a stack overflow.
3164 #[doc(hidden)]
3165 pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3166 self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3167 }
3168
3169 /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
/// if the index values in `idx` are sorted. This will maintain the sorted flags.
3171 ///
3172 /// # Warning
/// Be careful with allowing threads when calling this in a large hot loop;
/// every thread split may be on the rayon stack and lead to a stack overflow.
3175 #[doc(hidden)]
3176 pub unsafe fn _take_unchecked_slice_sorted(
3177 &self,
3178 idx: &[IdxSize],
3179 allow_threads: bool,
3180 sorted: IsSorted,
3181 ) -> Self {
3182 #[cfg(debug_assertions)]
3183 {
3184 if idx.len() > 2 {
3185 match sorted {
3186 IsSorted::Ascending => {
3187 assert!(idx[0] <= idx[idx.len() - 1]);
3188 },
3189 IsSorted::Descending => {
3190 assert!(idx[0] >= idx[idx.len() - 1]);
3191 },
3192 _ => {},
3193 }
3194 }
3195 }
3196 let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3197 ca.set_sorted_flag(sorted);
3198 self.take_unchecked_impl(&ca, allow_threads)
3199 }
3200
3201 #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3202 #[doc(hidden)]
3203 pub fn _partition_by_impl(
3204 &self,
3205 cols: &[PlSmallStr],
3206 stable: bool,
3207 include_key: bool,
3208 parallel: bool,
3209 ) -> PolarsResult<Vec<DataFrame>> {
3210 let selected_keys = self.select_columns(cols.iter().cloned())?;
3211 let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3212 let groups = groups.take_groups();
3213
3214 // drop key columns prior to calculation if requested
3215 let df = if include_key {
3216 self.clone()
3217 } else {
3218 self.drop_many(cols.iter().cloned())
3219 };
3220
3221 if parallel {
3222 // don't parallelize this
3223 // there is a lot of parallelization in take and this may easily SO
3224 POOL.install(|| {
3225 match groups.as_ref() {
3226 GroupsType::Idx(idx) => {
3227 // Rechunk as the gather may rechunk for every group #17562.
3228 let mut df = df.clone();
3229 df.as_single_chunk_par();
3230 Ok(idx
3231 .into_par_iter()
3232 .map(|(_, group)| {
3233 // groups are in bounds
3234 unsafe {
3235 df._take_unchecked_slice_sorted(
3236 group,
3237 false,
3238 IsSorted::Ascending,
3239 )
3240 }
3241 })
3242 .collect())
3243 },
3244 GroupsType::Slice { groups, .. } => Ok(groups
3245 .into_par_iter()
3246 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3247 .collect()),
3248 }
3249 })
3250 } else {
3251 match groups.as_ref() {
3252 GroupsType::Idx(idx) => {
3253 // Rechunk as the gather may rechunk for every group #17562.
3254 let mut df = df.clone();
3255 df.as_single_chunk();
3256 Ok(idx
3257 .into_iter()
3258 .map(|(_, group)| {
3259 // groups are in bounds
3260 unsafe {
3261 df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3262 }
3263 })
3264 .collect())
3265 },
3266 GroupsType::Slice { groups, .. } => Ok(groups
3267 .iter()
3268 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3269 .collect()),
3270 }
3271 }
3272 }
3273
/// Split into multiple DataFrames partitioned by groups.
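///
/// # Example
///
/// A minimal sketch; `"category"` is an illustrative key column:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame) -> PolarsResult<Vec<DataFrame>> {
///     df.partition_by(["category"], true)
/// }
/// ```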
3275 #[cfg(feature = "partition_by")]
3276 pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3277 where
3278 I: IntoIterator<Item = S>,
3279 S: Into<PlSmallStr>,
3280 {
3281 let cols = cols
3282 .into_iter()
3283 .map(Into::into)
3284 .collect::<Vec<PlSmallStr>>();
3285 self._partition_by_impl(cols.as_slice(), false, include_key, true)
3286 }
3287
/// Split into multiple DataFrames partitioned by groups.
/// The order of the groups is maintained.
3290 #[cfg(feature = "partition_by")]
3291 pub fn partition_by_stable<I, S>(
3292 &self,
3293 cols: I,
3294 include_key: bool,
3295 ) -> PolarsResult<Vec<DataFrame>>
3296 where
3297 I: IntoIterator<Item = S>,
3298 S: Into<PlSmallStr>,
3299 {
3300 let cols = cols
3301 .into_iter()
3302 .map(Into::into)
3303 .collect::<Vec<PlSmallStr>>();
3304 self._partition_by_impl(cols.as_slice(), true, include_key, true)
3305 }
3306
3307 /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3308 /// inserted as columns.
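///
/// # Example
///
/// A minimal sketch; `"my_struct"` is an illustrative `Struct` column name:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
///     df.unnest(["my_struct"])
/// }
/// ```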
3309 #[cfg(feature = "dtype-struct")]
3310 pub fn unnest<I: IntoVec<PlSmallStr>>(&self, cols: I) -> PolarsResult<DataFrame> {
3311 let cols = cols.into_vec();
3312 self.unnest_impl(cols.into_iter().collect())
3313 }
3314
3315 #[cfg(feature = "dtype-struct")]
3316 fn unnest_impl(&self, cols: PlHashSet<PlSmallStr>) -> PolarsResult<DataFrame> {
3317 let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3318 let mut count = 0;
3319 for s in &self.columns {
3320 if cols.contains(s.name()) {
3321 let ca = s.struct_()?.clone();
3322 new_cols.extend(ca.fields_as_series().into_iter().map(Column::from));
3323 count += 1;
3324 } else {
3325 new_cols.push(s.clone())
3326 }
3327 }
3328 if count != cols.len() {
3329 // one or more columns not found
3330 // the code below will return an error with the missing name
3331 let schema = self.schema();
3332 for col in cols {
3333 let _ = schema
3334 .get(col.as_str())
3335 .ok_or_else(|| polars_err!(col_not_found = col))?;
3336 }
3337 }
3338 DataFrame::new(new_cols)
3339 }
3340
3341 pub(crate) fn infer_height(cols: &[Column]) -> usize {
3342 cols.first().map_or(0, Column::len)
3343 }
3344
3345 pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
// @Optimize: this does a lot of unnecessary allocations. We should probably have an
// append_chunk or something like it. It is just quite difficult to make that safe.
3348 let df = DataFrame::from(rb);
3349 polars_ensure!(
3350 self.schema() == df.schema(),
SchemaMismatch: "cannot append record batch with different schema\n\nGot {:?}\nexpected: {:?}",
df.schema(), self.schema(),
3353 );
3354 self.vstack_mut_owned_unchecked(df);
3355 Ok(())
3356 }
3357}
3358
3359pub struct RecordBatchIter<'a> {
3360 columns: &'a Vec<Column>,
3361 schema: ArrowSchemaRef,
3362 idx: usize,
3363 n_chunks: usize,
3364 compat_level: CompatLevel,
3365 parallel: bool,
3366}
3367
3368impl Iterator for RecordBatchIter<'_> {
3369 type Item = RecordBatch;
3370
3371 fn next(&mut self) -> Option<Self::Item> {
3372 if self.idx >= self.n_chunks {
3373 return None;
3374 }
3375
3376 // Create a batch of the columns with the same chunk no.
3377 let batch_cols: Vec<ArrayRef> = if self.parallel {
3378 let iter = self
3379 .columns
3380 .par_iter()
3381 .map(Column::as_materialized_series)
3382 .map(|s| s.to_arrow(self.idx, self.compat_level));
3383 POOL.install(|| iter.collect())
3384 } else {
3385 self.columns
3386 .iter()
3387 .map(Column::as_materialized_series)
3388 .map(|s| s.to_arrow(self.idx, self.compat_level))
3389 .collect()
3390 };
3391 self.idx += 1;
3392
3393 let length = batch_cols.first().map_or(0, |arr| arr.len());
3394 Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3395 }
3396
3397 fn size_hint(&self) -> (usize, Option<usize>) {
3398 let n = self.n_chunks - self.idx;
3399 (n, Some(n))
3400 }
3401}
3402
3403pub struct PhysRecordBatchIter<'a> {
3404 schema: ArrowSchemaRef,
3405 arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3406}
3407
3408impl Iterator for PhysRecordBatchIter<'_> {
3409 type Item = RecordBatch;
3410
3411 fn next(&mut self) -> Option<Self::Item> {
3412 let arrs = self
3413 .arr_iters
3414 .iter_mut()
3415 .map(|phys_iter| phys_iter.next().cloned())
3416 .collect::<Option<Vec<_>>>()?;
3417
3418 let length = arrs.first().map_or(0, |arr| arr.len());
3419 Some(RecordBatch::new(length, self.schema.clone(), arrs))
3420 }
3421
3422 fn size_hint(&self) -> (usize, Option<usize>) {
3423 if let Some(iter) = self.arr_iters.first() {
3424 iter.size_hint()
3425 } else {
3426 (0, None)
3427 }
3428 }
3429}
3430
3431impl Default for DataFrame {
3432 fn default() -> Self {
3433 DataFrame::empty()
3434 }
3435}
3436
3437impl From<DataFrame> for Vec<Column> {
3438 fn from(df: DataFrame) -> Self {
3439 df.columns
3440 }
3441}
3442
3443// utility to test if we can vstack/extend the columns
3444fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3445 polars_ensure!(
3446 left.name() == right.name(),
3447 ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3448 left.name(), right.name(),
3449 );
3450 Ok(())
3451}
3452
3453#[cfg(test)]
3454mod test {
3455 use super::*;
3456
3457 fn create_frame() -> DataFrame {
3458 let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
3459 let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
3460 DataFrame::new(vec![s0, s1]).unwrap()
3461 }
3462
3463 #[test]
3464 #[cfg_attr(miri, ignore)]
3465 fn test_recordbatch_iterator() {
3466 let df = df!(
3467 "foo" => [1, 2, 3, 4, 5]
3468 )
3469 .unwrap();
3470 let mut iter = df.iter_chunks(CompatLevel::newest(), false);
3471 assert_eq!(5, iter.next().unwrap().len());
3472 assert!(iter.next().is_none());
3473 }
3474
3475 #[test]
3476 #[cfg_attr(miri, ignore)]
3477 fn test_select() {
3478 let df = create_frame();
3479 assert_eq!(
3480 df.column("days")
3481 .unwrap()
3482 .as_series()
3483 .unwrap()
3484 .equal(1)
3485 .unwrap()
3486 .sum(),
3487 Some(1)
3488 );
3489 }
3490
3491 #[test]
3492 #[cfg_attr(miri, ignore)]
3493 fn test_filter_broadcast_on_string_col() {
3494 let col_name = "some_col";
3495 let v = vec!["test".to_string()];
3496 let s0 = Column::new(PlSmallStr::from_str(col_name), v);
3497 let mut df = DataFrame::new(vec![s0]).unwrap();
3498
3499 df = df
3500 .filter(
3501 &df.column(col_name)
3502 .unwrap()
3503 .as_materialized_series()
3504 .equal("")
3505 .unwrap(),
3506 )
3507 .unwrap();
3508 assert_eq!(
3509 df.column(col_name)
3510 .unwrap()
3511 .as_materialized_series()
3512 .n_chunks(),
3513 1
3514 );
3515 }
3516
3517 #[test]
3518 #[cfg_attr(miri, ignore)]
3519 fn test_filter_broadcast_on_list_col() {
3520 let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
3521 let ll: ListChunked = [&s1].iter().copied().collect();
3522
3523 let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
3524 let new = ll.filter(&mask).unwrap();
3525
3526 assert_eq!(new.chunks.len(), 1);
3527 assert_eq!(new.len(), 0);
3528 }
3529
3530 #[test]
3531 fn slice() {
3532 let df = create_frame();
3533 let sliced_df = df.slice(0, 2);
3534 assert_eq!(sliced_df.shape(), (2, 2));
3535 }
3536
3537 #[test]
3538 fn rechunk_false() {
3539 let df = create_frame();
3540 assert!(!df.should_rechunk())
3541 }
3542
3543 #[test]
3544 fn rechunk_true() -> PolarsResult<()> {
3545 let mut base = df!(
3546 "a" => [1, 2, 3],
3547 "b" => [1, 2, 3]
3548 )?;
3549
3550 // Create a series with multiple chunks
3551 let mut s = Series::new("foo".into(), 0..2);
3552 let s2 = Series::new("bar".into(), 0..1);
3553 s.append(&s2)?;
3554
3555 // Append series to frame
3556 let out = base.with_column(s)?;
3557
3558 // Now we should rechunk
3559 assert!(out.should_rechunk());
3560 Ok(())
3561 }
3562
3563 #[test]
3564 fn test_duplicate_column() {
3565 let mut df = df! {
3566 "foo" => [1, 2, 3]
3567 }
3568 .unwrap();
3569 // check if column is replaced
3570 assert!(
3571 df.with_column(Series::new("foo".into(), &[1, 2, 3]))
3572 .is_ok()
3573 );
3574 assert!(
3575 df.with_column(Series::new("bar".into(), &[1, 2, 3]))
3576 .is_ok()
3577 );
3578 assert!(df.column("bar").is_ok())
3579 }
3580
3581 #[test]
3582 #[cfg_attr(miri, ignore)]
3583 fn distinct() {
3584 let df = df! {
3585 "flt" => [1., 1., 2., 2., 3., 3.],
3586 "int" => [1, 1, 2, 2, 3, 3, ],
3587 "str" => ["a", "a", "b", "b", "c", "c"]
3588 }
3589 .unwrap();
3590 let df = df
3591 .unique_stable(None, UniqueKeepStrategy::First, None)
3592 .unwrap()
3593 .sort(["flt"], SortMultipleOptions::default())
3594 .unwrap();
3595 let valid = df! {
3596 "flt" => [1., 2., 3.],
3597 "int" => [1, 2, 3],
3598 "str" => ["a", "b", "c"]
3599 }
3600 .unwrap();
3601 assert!(df.equals(&valid));
3602 }
3603
3604 #[test]
3605 fn test_vstack() {
// check that it does not accidentally rechunk
3607 let mut df = df! {
3608 "flt" => [1., 1., 2., 2., 3., 3.],
3609 "int" => [1, 1, 2, 2, 3, 3, ],
3610 "str" => ["a", "a", "b", "b", "c", "c"]
3611 }
3612 .unwrap();
3613
3614 df.vstack_mut(&df.slice(0, 3)).unwrap();
3615 assert_eq!(df.first_col_n_chunks(), 2)
3616 }
3617
3618 #[test]
3619 fn test_vstack_on_empty_dataframe() {
3620 let mut df = DataFrame::empty();
3621
3622 let df_data = df! {
3623 "flt" => [1., 1., 2., 2., 3., 3.],
3624 "int" => [1, 1, 2, 2, 3, 3, ],
3625 "str" => ["a", "a", "b", "b", "c", "c"]
3626 }
3627 .unwrap();
3628
3629 df.vstack_mut(&df_data).unwrap();
3630 assert_eq!(df.height, 6)
3631 }
3632
3633 #[test]
3634 fn test_replace_or_add() -> PolarsResult<()> {
3635 let mut df = df!(
3636 "a" => [1, 2, 3],
3637 "b" => [1, 2, 3]
3638 )?;
3639
3640 // check that the new column is "c" and not "bar".
3641 df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;
3642
3643 assert_eq!(df.get_column_names(), &["a", "b", "c"]);
3644 Ok(())
3645 }
3646
3647 #[test]
3648 fn test_unique_keep_none_with_slice() {
3649 let df = df! {
3650 "x" => [1, 2, 3, 2, 1]
3651 }
3652 .unwrap();
3653 let out = df
3654 .unique_stable(
3655 Some(&["x".to_string()][..]),
3656 UniqueKeepStrategy::None,
3657 Some((0, 2)),
3658 )
3659 .unwrap();
3660 let expected = df! {
3661 "x" => [3]
3662 }
3663 .unwrap();
3664 assert!(out.equals(&expected));
3665 }
3666}