use std::fmt::{Debug, Display, Formatter};
use std::hash::Hash;

use num_traits::NumCast;
use polars_compute::rolling::QuantileMethod;
use polars_utils::format_pl_smallstr;
use polars_utils::hashing::DirtyHash;
use rayon::prelude::*;

use self::hashing::*;
use crate::POOL;
use crate::prelude::*;
use crate::utils::{_set_partition_size, accumulate_dataframes_vertical};

pub mod aggregations;
pub mod expr;
pub(crate) mod hashing;
mod into_groups;
mod position;

pub use into_groups::*;
pub use position::*;

use crate::chunked_array::ops::row_encode::{
    encode_rows_unordered, encode_rows_vertical_par_unordered,
};

impl DataFrame {
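    /// Group the `DataFrame` by the given key [`Column`]s.
    ///
    /// Every key must either have the same length as the `DataFrame` or have
    /// length 1, in which case it is broadcast to the frame's height. With
    /// `sorted` set to `true` the group order is kept stable; this is what
    /// [`DataFrame::group_by_stable`] passes.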
    pub fn group_by_with_series(
        &self,
        mut by: Vec<Column>,
        multithreaded: bool,
        sorted: bool,
    ) -> PolarsResult<GroupBy<'_>> {
        polars_ensure!(
            !by.is_empty(),
            ComputeError: "at least one key is required in a group_by operation"
        );

        let common_height = if self.width() > 0 {
            self.height()
        } else {
            by.iter().map(|s| s.len()).max().expect("at least 1 key")
        };
        for by_key in by.iter_mut() {
            if by_key.len() != common_height {
                polars_ensure!(
                    by_key.len() == 1,
                    ShapeMismatch: "series used as keys should have the same length as the DataFrame"
                );
                *by_key = by_key.new_from_index(0, common_height)
            }
        }

        let groups = if by.len() == 1 {
            let column = &by[0];
            column
                .as_materialized_series()
                .group_tuples(multithreaded, sorted)
        } else if by.iter().any(|s| s.dtype().is_object()) {
            #[cfg(feature = "object")]
            {
                let mut df = DataFrame::new(self.height(), by.clone()).unwrap();
                let n = df.height();
                let rows = df.to_av_rows();
                let iter = (0..n).map(|i| rows.get(i));
                Ok(group_by(iter, sorted))
            }
            #[cfg(not(feature = "object"))]
            {
                unreachable!()
            }
        } else {
            // Null dtype keys cannot distinguish groups, so drop them.
            let by = by
                .iter()
                .filter(|s| !s.dtype().is_null())
                .cloned()
                .collect::<Vec<_>>();
            if by.is_empty() {
                // All keys had the null dtype: every row falls into a single group.
                let groups = if self.height() == 0 {
                    vec![]
                } else {
                    vec![[0, self.height() as IdxSize]]
                };

                Ok(GroupsType::new_slice(groups, false, true))
            } else {
                // Row-encode multiple keys into a single binary column and group on that.
                let rows = if multithreaded {
                    encode_rows_vertical_par_unordered(&by)
                } else {
                    encode_rows_unordered(&by)
                }?
                .into_series();
                rows.group_tuples(multithreaded, sorted)
            }
        };
        Ok(GroupBy::new(self, by, groups?.into_sliceable(), None))
    }

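    /// Group the `DataFrame` using the given column names as grouping keys.
    ///
    /// Group order is not guaranteed; use [`DataFrame::group_by_stable`] if
    /// the groups must keep the order in which they first occur.
    ///
    /// # Example
    ///
    /// A minimal sketch; the `date` and `temp` column names are placeholders
    /// (compare the tests at the bottom of this module):
    ///
    /// ```ignore
    /// use polars_core::prelude::*;
    ///
    /// fn temp_count_per_date(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     // One row per unique `date`, with a `temp_count` column.
    ///     df.group_by(["date"])?.select(["temp"]).count()
    /// }
    /// ```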
    pub fn group_by<I, S>(&self, by: I) -> PolarsResult<GroupBy<'_>>
    where
        I: IntoIterator<Item = S>,
        S: AsRef<str>,
    {
        let selected_keys = self.select_to_vec(by)?;
        self.group_by_with_series(selected_keys, true, false)
    }

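    /// Group the `DataFrame` using the given column names as grouping keys,
    /// keeping a stable group order.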
    pub fn group_by_stable<I, S>(&self, by: I) -> PolarsResult<GroupBy<'_>>
    where
        I: IntoIterator<Item = S>,
        S: AsRef<str>,
    {
        let selected_keys = self.select_to_vec(by)?;
        self.group_by_with_series(selected_keys, true, true)
    }
}

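/// Intermediate state of a group-by operation on a [`DataFrame`].
///
/// Created by [`DataFrame::group_by`], [`DataFrame::group_by_stable`] or
/// [`DataFrame::group_by_with_series`]; it holds the key columns, the
/// resolved group positions and an optional selection of columns to
/// aggregate.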
#[derive(Debug, Clone)]
pub struct GroupBy<'a> {
    pub df: &'a DataFrame,
    pub(crate) selected_keys: Vec<Column>,
    groups: GroupPositions,
    pub(crate) selected_agg: Option<Vec<PlSmallStr>>,
}

impl<'a> GroupBy<'a> {
    pub fn new(
        df: &'a DataFrame,
        by: Vec<Column>,
        groups: GroupPositions,
        selected_agg: Option<Vec<PlSmallStr>>,
    ) -> Self {
        GroupBy {
            df,
            selected_keys: by,
            groups,
            selected_agg,
        }
    }

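    /// Select the column(s) that should be aggregated.
    ///
    /// If no selection is made, all columns except the group keys are
    /// aggregated (see `prepare_agg`).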
    #[must_use]
    pub fn select<I: IntoIterator<Item = S>, S: Into<PlSmallStr>>(mut self, selection: I) -> Self {
        self.selected_agg = Some(selection.into_iter().map(|s| s.into()).collect());
        self
    }

    pub fn get_groups(&self) -> &GroupPositions {
        &self.groups
    }

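    /// Get a mutable reference to the group positions.
    ///
    /// # Safety
    ///
    /// The group positions must stay consistent with the underlying
    /// `DataFrame`: the aggregation and take paths in this module index into
    /// it by these positions without bounds checks.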
    pub unsafe fn get_groups_mut(&mut self) -> &mut GroupPositions {
        &mut self.groups
    }

    pub fn into_groups(self) -> GroupPositions {
        self.groups
    }

    pub fn keys_sliced(&self, slice: Option<(i64, usize)>) -> Vec<Column> {
        #[allow(unused_assignments)]
        let mut groups_owned = None;

        let groups = if let Some((offset, len)) = slice {
            groups_owned = Some(self.groups.slice(offset, len));
            groups_owned.as_deref().unwrap()
        } else {
            &self.groups
        };
        POOL.install(|| {
            self.selected_keys
                .par_iter()
                .map(Column::as_materialized_series)
                .map(|s| {
                    match groups {
                        GroupsType::Idx(groups) => {
                            let mut out = unsafe { s.take_slice_unchecked(groups.first()) };
                            if groups.sorted {
                                out.set_sorted_flag(s.is_sorted_flag());
                            };
                            out
                        },
                        GroupsType::Slice {
                            groups,
                            overlapping,
                            monotonic: _,
                        } => {
                            if *overlapping && !groups.is_empty() {
                                // Overlapping slice groups: take the contiguous range
                                // spanned by the first and last group.
                                let offset = groups[0][0];
                                let [upper_offset, upper_len] = groups[groups.len() - 1];
                                return s.slice(
                                    offset as i64,
                                    ((upper_offset + upper_len) - offset) as usize,
                                );
                            }

                            let indices = groups
                                .iter()
                                .map(|&[first, _len]| first)
                                .collect_ca(PlSmallStr::EMPTY);
                            let mut out = unsafe { s.take_unchecked(&indices) };
                            out.set_sorted_flag(s.is_sorted_flag());
                            out
                        },
                    }
                })
                .map(Column::from)
                .collect()
        })
    }

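    /// Get the key values of every group as columns, one row per group.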
    pub fn keys(&self) -> Vec<Column> {
        self.keys_sliced(None)
    }

    fn prepare_agg(&self) -> PolarsResult<(Vec<Column>, Vec<Column>)> {
        let keys = self.keys();

        let agg_col = match &self.selected_agg {
            Some(selection) => self.df.select_to_vec(selection),
            None => {
                // No explicit selection: aggregate every column that is not a key.
                let by: Vec<_> = self.selected_keys.iter().map(|s| s.name()).collect();
                let selection = self
                    .df
                    .columns()
                    .iter()
                    .map(|s| s.name())
                    .filter(|a| !by.contains(a))
                    .cloned()
                    .collect::<Vec<_>>();

                self.df.select_to_vec(selection.as_slice())
            },
        }?;

        Ok((keys, agg_col))
    }

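    /// Aggregate the grouped `DataFrame` by taking the mean of the selected
    /// columns per group; output columns get a `_mean` suffix
    /// (see [`fmt_group_by_column`]).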
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn mean(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Mean);
            let mut agg = unsafe { agg_col.agg_mean(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }

        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn sum(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Sum);
            let mut agg = unsafe { agg_col.agg_sum(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn min(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Min);
            let mut agg = unsafe { agg_col.agg_min(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn max(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Max);
            let mut agg = unsafe { agg_col.agg_max(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn first(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::First);
            let mut agg = unsafe { agg_col.agg_first(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn last(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Last);
            let mut agg = unsafe { agg_col.agg_last(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn n_unique(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::NUnique);
            let mut agg = unsafe { agg_col.agg_n_unique(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn quantile(&self, quantile: f64, method: QuantileMethod) -> PolarsResult<DataFrame> {
        polars_ensure!(
            (0.0..=1.0).contains(&quantile),
            ComputeError: "`quantile` should be between 0.0 and 1.0"
        );
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(
                agg_col.name().as_str(),
                GroupByMethod::Quantile(quantile, method),
            );
            let mut agg = unsafe { agg_col.agg_quantile(&self.groups, quantile, method) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn median(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Median);
            let mut agg = unsafe { agg_col.agg_median(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn var(&self, ddof: u8) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Var(ddof));
            let mut agg = unsafe { agg_col.agg_var(&self.groups, ddof) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn std(&self, ddof: u8) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Std(ddof));
            let mut agg = unsafe { agg_col.agg_std(&self.groups, ddof) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

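    /// Count the number of rows per group (nulls included); output columns
    /// get a `_count` suffix.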
    pub fn count(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(
                agg_col.name().as_str(),
                GroupByMethod::Count {
                    include_nulls: true,
                },
            );
            let mut ca = self.groups.group_count();
            ca.rename(new_name);
            cols.push(ca.into_column());
        }
        DataFrame::new_infer_height(cols)
    }

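    /// Return the group keys together with a `groups` list column holding the
    /// row indices of every group.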
    pub fn groups(&self) -> PolarsResult<DataFrame> {
        let mut cols = self.keys();
        let mut column = self.groups.as_list_chunked();
        let new_name = fmt_group_by_column("", GroupByMethod::Groups);
        column.rename(new_name);
        cols.push(column.into_column());
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn agg_list(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Implode);
            let mut agg = unsafe { agg_col.agg_list(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    fn prepare_apply(&self) -> PolarsResult<DataFrame> {
        if let Some(agg) = &self.selected_agg {
            if agg.is_empty() {
                Ok(self.df.clone())
            } else {
                let mut new_cols = Vec::with_capacity(self.selected_keys.len() + agg.len());
                new_cols.extend_from_slice(&self.selected_keys);
                let cols = self.df.select_to_vec(agg.as_slice())?;
                new_cols.extend(cols);
                Ok(unsafe { DataFrame::new_unchecked(self.df.height(), new_cols) })
            }
        } else {
            Ok(self.df.clone())
        }
    }

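    /// Apply a closure over the groups in parallel and vertically concatenate
    /// the resulting `DataFrame`s.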
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn par_apply<F>(&self, f: F) -> PolarsResult<DataFrame>
    where
        F: Fn(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        polars_ensure!(self.df.height() > 0, ComputeError: "cannot group_by + apply on empty 'DataFrame'");
        let df = self.prepare_apply()?;
        let dfs = self
            .get_groups()
            .par_iter()
            .map(|g| {
                let sub_df = unsafe { take_df(&df, g) };
                f(sub_df)
            })
            .collect::<PolarsResult<Vec<_>>>()?;

        let mut df = accumulate_dataframes_vertical(dfs)?;
        df.rechunk_mut_par();
        Ok(df)
    }

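    /// Apply a closure over every group sequentially and vertically
    /// concatenate the results.
    ///
    /// # Example
    ///
    /// A minimal sketch; `date` is a placeholder key column and the input is
    /// assumed to be non-empty:
    ///
    /// ```ignore
    /// use polars_core::prelude::*;
    ///
    /// fn first_row_per_group(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     // Keep only the first row of every group keyed by `date`.
    ///     df.group_by(["date"])?.apply(|group| Ok(group.head(Some(1))))
    /// }
    /// ```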
    pub fn apply<F>(&self, f: F) -> PolarsResult<DataFrame>
    where
        F: FnMut(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        self.apply_sliced(None, f, None)
    }

    pub fn apply_sliced<F>(
        &self,
        slice: Option<(i64, usize)>,
        mut f: F,
        schema: Option<&SchemaRef>,
    ) -> PolarsResult<DataFrame>
    where
        F: FnMut(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        if self.df.height() == 0 {
            if let Some(schema) = schema {
                return Ok(DataFrame::empty_with_arc_schema(schema.clone()));
            }

            polars_bail!(ComputeError: "cannot group_by + apply on empty 'DataFrame'");
        }

        let df = self.prepare_apply()?;
        let max_height = if let Some((offset, len)) = slice {
            offset.try_into().unwrap_or(usize::MAX).saturating_add(len)
        } else {
            usize::MAX
        };
        let mut height = 0;
        let mut dfs = Vec::with_capacity(self.get_groups().len());
        for g in self.get_groups().iter() {
            let sub_df = unsafe { take_df(&df, g) };
            let df = f(sub_df)?;
            height += df.height();
            dfs.push(df);

            // Stop early once enough rows are produced to cover the requested slice.
            if height >= max_height {
                break;
            }
        }

        let mut df = accumulate_dataframes_vertical(dfs)?;
        if let Some((offset, len)) = slice {
            df = df.slice(offset, len);
        }
        Ok(df)
    }

    pub fn sliced(mut self, slice: Option<(i64, usize)>) -> Self {
        match slice {
            None => self,
            Some((offset, length)) => {
                self.groups = self.groups.slice(offset, length);
                self.selected_keys = self.keys_sliced(slice);
                self
            },
        }
    }
}

unsafe fn take_df(df: &DataFrame, g: GroupsIndicator) -> DataFrame {
    match g {
        GroupsIndicator::Idx(idx) => df.take_slice_unchecked(idx.1),
        GroupsIndicator::Slice([first, len]) => df.slice(first as i64, len as usize),
    }
}

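/// The aggregation applied in a group-by operation; also determines the
/// output column name via [`fmt_group_by_column`].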
#[derive(Copy, Clone, Debug)]
pub enum GroupByMethod {
    Min,
    NanMin,
    Max,
    NanMax,
    Median,
    Mean,
    First,
    FirstNonNull,
    Last,
    LastNonNull,
    Item { allow_empty: bool },
    Sum,
    Groups,
    NUnique,
    Quantile(f64, QuantileMethod),
    Count { include_nulls: bool },
    Implode,
    Std(u8),
    Var(u8),
    ArgMin,
    ArgMax,
}

impl Display for GroupByMethod {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        use GroupByMethod::*;
        let s = match self {
            Min => "min",
            NanMin => "nan_min",
            Max => "max",
            NanMax => "nan_max",
            Median => "median",
            Mean => "mean",
            First => "first",
            FirstNonNull => "first_non_null",
            Last => "last",
            LastNonNull => "last_non_null",
            Item { .. } => "item",
            Sum => "sum",
            Groups => "groups",
            NUnique => "n_unique",
            Quantile(_, _) => "quantile",
            Count { .. } => "count",
            Implode => "list",
            Std(_) => "std",
            Var(_) => "var",
            ArgMin => "arg_min",
            ArgMax => "arg_max",
        };
        write!(f, "{s}")
    }
}

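/// Derive the output column name for an aggregation, e.g. `"temp"` aggregated
/// with [`GroupByMethod::Mean`] becomes `"temp_mean"`.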
pub fn fmt_group_by_column(name: &str, method: GroupByMethod) -> PlSmallStr {
    use GroupByMethod::*;
    match method {
        Min => format_pl_smallstr!("{name}_min"),
        Max => format_pl_smallstr!("{name}_max"),
        NanMin => format_pl_smallstr!("{name}_nan_min"),
        NanMax => format_pl_smallstr!("{name}_nan_max"),
        Median => format_pl_smallstr!("{name}_median"),
        Mean => format_pl_smallstr!("{name}_mean"),
        First => format_pl_smallstr!("{name}_first"),
        FirstNonNull => format_pl_smallstr!("{name}_first_non_null"),
        Last => format_pl_smallstr!("{name}_last"),
        LastNonNull => format_pl_smallstr!("{name}_last_non_null"),
        Item { .. } => format_pl_smallstr!("{name}_item"),
        Sum => format_pl_smallstr!("{name}_sum"),
        Groups => PlSmallStr::from_static("groups"),
        NUnique => format_pl_smallstr!("{name}_n_unique"),
        Count { .. } => format_pl_smallstr!("{name}_count"),
        Implode => format_pl_smallstr!("{name}_agg_list"),
        Quantile(quantile, _interpol) => format_pl_smallstr!("{name}_quantile_{quantile:.2}"),
        Std(_) => format_pl_smallstr!("{name}_agg_std"),
        Var(_) => format_pl_smallstr!("{name}_agg_var"),
        ArgMin => format_pl_smallstr!("{name}_arg_min"),
        ArgMax => format_pl_smallstr!("{name}_arg_max"),
    }
}

#[cfg(test)]
mod test {
    use num_traits::FloatConst;

    use crate::prelude::*;

    #[test]
    #[cfg(feature = "dtype-date")]
    #[cfg_attr(miri, ignore)]
    fn test_group_by() -> PolarsResult<()> {
        let s0 = Column::new(
            PlSmallStr::from_static("date"),
            &[
                "2020-08-21",
                "2020-08-21",
                "2020-08-22",
                "2020-08-23",
                "2020-08-22",
            ],
        );
        let s1 = Column::new(PlSmallStr::from_static("temp"), [20, 10, 7, 9, 1]);
        let s2 = Column::new(PlSmallStr::from_static("rain"), [0.2, 0.1, 0.3, 0.1, 0.01]);
        let df = DataFrame::new_infer_height(vec![s0, s1, s2]).unwrap();

        let out = df.group_by_stable(["date"])?.select(["temp"]).count()?;
        assert_eq!(
            out.column("temp_count")?,
            &Column::new(PlSmallStr::from_static("temp_count"), [2 as IdxSize, 2, 1])
        );

        #[allow(deprecated)]
        let out = df
            .group_by_stable(["date"])?
            .select(["temp", "rain"])
            .mean()?;
        assert_eq!(
            out.column("temp_mean")?,
            &Column::new(PlSmallStr::from_static("temp_mean"), [15.0f64, 4.0, 9.0])
        );

        #[allow(deprecated)]
        let out = df
            .group_by_stable(["date", "temp"])?
            .select(["rain"])
            .mean()?;
        assert!(out.column("rain_mean").is_ok());

        #[allow(deprecated)]
        let out = df.group_by_stable(["date"])?.select(["temp"]).sum()?;
        assert_eq!(
            out.column("temp_sum")?,
            &Column::new(PlSmallStr::from_static("temp_sum"), [30, 8, 9])
        );

        #[allow(deprecated)]
        let gb = df.group_by(["date"]).unwrap().n_unique().unwrap();
        assert_eq!(gb.width(), 3);
        Ok(())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_static_group_by_by_12_columns() {
        let s0 = Column::new("G1".into(), ["A", "A", "B", "B", "C"].as_ref());
        let s1 = Column::new("N".into(), [1, 2, 2, 4, 2].as_ref());
        let s2 = Column::new("G2".into(), ["k", "l", "m", "m", "l"].as_ref());
        let s3 = Column::new("G3".into(), ["a", "b", "c", "c", "d"].as_ref());
        let s4 = Column::new("G4".into(), ["1", "2", "3", "3", "4"].as_ref());
        let s5 = Column::new("G5".into(), ["X", "Y", "Z", "Z", "W"].as_ref());
        let s6 = Column::new("G6".into(), [false, true, true, true, false].as_ref());
        let s7 = Column::new("G7".into(), ["r", "x", "q", "q", "o"].as_ref());
        let s8 = Column::new("G8".into(), ["R", "X", "Q", "Q", "O"].as_ref());
        let s9 = Column::new("G9".into(), [1, 2, 3, 3, 4].as_ref());
        let s10 = Column::new("G10".into(), [".", "!", "?", "?", "/"].as_ref());
        let s11 = Column::new("G11".into(), ["(", ")", "@", "@", "$"].as_ref());
        let s12 = Column::new("G12".into(), ["-", "_", ";", ";", ","].as_ref());

        let df = DataFrame::new_infer_height(vec![
            s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
        ])
        .unwrap();

        #[allow(deprecated)]
        let adf = df
            .group_by([
                "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12",
            ])
            .unwrap()
            .select(["N"])
            .sum()
            .unwrap();

        assert_eq!(
            Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
            &[Some(1), Some(2), Some(2), Some(6)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_dynamic_group_by_by_13_columns() {
        let series_content = ["A", "A", "B", "B", "C"];

        let series_names = [
            "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12", "G13",
        ];

        let mut columns = Vec::with_capacity(14);

        for series_name in series_names {
            let group_columns = Column::new(series_name.into(), series_content.as_ref());
            columns.push(group_columns);
        }

        let agg_series = Column::new("N".into(), [1, 2, 3, 3, 4].as_ref());
        columns.push(agg_series);

        let df = DataFrame::new_infer_height(columns).unwrap();

        #[allow(deprecated)]
        let adf = df
            .group_by(series_names)
            .unwrap()
            .select(["N"])
            .sum()
            .unwrap();

        for series_name in &series_names {
            assert_eq!(
                Vec::from(&adf.column(series_name).unwrap().str().unwrap().sort(false)),
                &[Some("A"), Some("B"), Some("C")]
            );
        }

        assert_eq!(
            Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
            &[Some(3), Some(4), Some(6)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_floats() {
        let df = df! {"flt" => [1., 1., 2., 2., 3.],
            "val" => [1, 1, 1, 1, 1]
        }
        .unwrap();
        #[allow(deprecated)]
        let res = df.group_by(["flt"]).unwrap().sum().unwrap();
        let res = res.sort(["flt"], SortMultipleOptions::default()).unwrap();
        assert_eq!(
            Vec::from(res.column("val_sum").unwrap().i32().unwrap()),
            &[Some(2), Some(2), Some(1)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    #[cfg(feature = "dtype-categorical")]
    fn test_group_by_categorical() {
        let mut df = df! {"foo" => ["a", "a", "b", "b", "c"],
            "ham" => ["a", "a", "b", "b", "c"],
            "bar" => [1, 1, 1, 1, 1]
        }
        .unwrap();

        df.apply("foo", |s| {
            s.cast(&DataType::from_categories(Categories::global()))
                .unwrap()
        })
        .unwrap();

        #[allow(deprecated)]
        let res = df
            .group_by_stable(["foo", "ham"])
            .unwrap()
            .select(["bar"])
            .sum()
            .unwrap();

        assert_eq!(
            Vec::from(
                res.column("bar_sum")
                    .unwrap()
                    .as_materialized_series()
                    .i32()
                    .unwrap()
            ),
            &[Some(2), Some(2), Some(1)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_null_handling() -> PolarsResult<()> {
        let df = df!(
            "a" => ["a", "a", "a", "b", "b"],
            "b" => [Some(1), Some(2), None, None, Some(1)]
        )?;
        #[allow(deprecated)]
        let out = df.group_by_stable(["a"])?.mean()?;

        assert_eq!(
            Vec::from(out.column("b_mean")?.as_materialized_series().f64()?),
            &[Some(1.5), Some(1.0)]
        );
        Ok(())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_var() -> PolarsResult<()> {
        let df = df![
            "g" => ["foo", "foo", "bar"],
            "flt" => [1.0, 2.0, 3.0],
            "int" => [1, 2, 3]
        ]?;

        #[allow(deprecated)]
        let out = df.group_by_stable(["g"])?.select(["int"]).var(1)?;

        assert_eq!(out.column("int_agg_var")?.f64()?.get(0), Some(0.5));
        #[allow(deprecated)]
        let out = df.group_by_stable(["g"])?.select(["int"]).std(1)?;
        let val = out.column("int_agg_std")?.f64()?.get(0).unwrap();
        let expected = f64::FRAC_1_SQRT_2();
        assert!((val - expected).abs() < 0.000001);
        Ok(())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    #[cfg(feature = "dtype-categorical")]
    fn test_group_by_null_group() -> PolarsResult<()> {
        let mut df = df![
            "g" => [Some("foo"), Some("foo"), Some("bar"), None, None],
            "flt" => [1.0, 2.0, 3.0, 1.0, 1.0],
            "int" => [1, 2, 3, 1, 1]
        ]?;

        df.try_apply("g", |s| {
            s.cast(&DataType::from_categories(Categories::global()))
        })?;

        #[allow(deprecated)]
        let _ = df.group_by(["g"])?.sum()?;
        Ok(())
    }
}