use std::fmt::{Debug, Display, Formatter};
use std::hash::Hash;

use num_traits::NumCast;
use polars_compute::rolling::QuantileMethod;
use polars_utils::format_pl_smallstr;
use polars_utils::hashing::DirtyHash;
use rayon::prelude::*;

use self::hashing::*;
use crate::POOL;
use crate::prelude::*;
use crate::utils::{_set_partition_size, accumulate_dataframes_vertical};

pub mod aggregations;
pub mod expr;
pub(crate) mod hashing;
mod into_groups;
mod position;

pub use into_groups::*;
pub use position::*;

use crate::chunked_array::ops::row_encode::{
    encode_rows_unordered, encode_rows_vertical_par_unordered,
};

impl DataFrame {
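    /// Group the `DataFrame` by the given key [`Column`]s.
    ///
    /// Every key must either have the same length as the `DataFrame` or have
    /// length 1, in which case it is broadcast to the frame's height. With
    /// `sorted` set to `true` the group order is kept stable; this is what
    /// [`DataFrame::group_by_stable`] passes.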
    pub fn group_by_with_series(
        &self,
        mut by: Vec<Column>,
        multithreaded: bool,
        sorted: bool,
    ) -> PolarsResult<GroupBy<'_>> {
        polars_ensure!(
            !by.is_empty(),
            ComputeError: "at least one key is required in a group_by operation"
        );

        let common_height = if self.width() > 0 {
            self.height()
        } else {
            by.iter().map(|s| s.len()).max().expect("at least 1 key")
        };
        for by_key in by.iter_mut() {
            if by_key.len() != common_height {
                polars_ensure!(
                    by_key.len() == 1,
                    ShapeMismatch: "series used as keys should have the same length as the DataFrame"
                );
                *by_key = by_key.new_from_index(0, common_height)
            }
        }

        let groups = if by.len() == 1 {
            let column = &by[0];
            column
                .as_materialized_series()
                .group_tuples(multithreaded, sorted)
        } else if by.iter().any(|s| s.dtype().is_object()) {
            #[cfg(feature = "object")]
            {
                let mut df = DataFrame::new(self.height(), by.clone()).unwrap();
                let n = df.height();
                let rows = df.to_av_rows();
                let iter = (0..n).map(|i| rows.get(i));
                Ok(group_by(iter, sorted))
            }
            #[cfg(not(feature = "object"))]
            {
                unreachable!()
            }
        } else {
            // Null dtype keys cannot distinguish groups, so drop them.
            let by = by
                .iter()
                .filter(|s| !s.dtype().is_null())
                .cloned()
                .collect::<Vec<_>>();
            if by.is_empty() {
                // All keys had the null dtype: every row falls into a single group.
                let groups = if self.height() == 0 {
                    vec![]
                } else {
                    vec![[0, self.height() as IdxSize]]
                };

                Ok(GroupsType::new_slice(groups, false, true))
            } else {
                // Row-encode multiple keys into a single binary column and group on that.
                let rows = if multithreaded {
                    encode_rows_vertical_par_unordered(&by)
                } else {
                    encode_rows_unordered(&by)
                }?
                .into_series();
                rows.group_tuples(multithreaded, sorted)
            }
        };
        Ok(GroupBy::new(self, by, groups?.into_sliceable(), None))
    }

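    /// Group the `DataFrame` using the given column names as grouping keys.
    ///
    /// Group order is not guaranteed; use [`DataFrame::group_by_stable`] if
    /// the groups must keep the order in which they first occur.
    ///
    /// # Example
    ///
    /// A minimal sketch; the `date` and `temp` column names are placeholders
    /// (compare the tests at the bottom of this module):
    ///
    /// ```ignore
    /// use polars_core::prelude::*;
    ///
    /// fn temp_count_per_date(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     // One row per unique `date`, with a `temp_count` column.
    ///     df.group_by(["date"])?.select(["temp"]).count()
    /// }
    /// ```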
    pub fn group_by<I, S>(&self, by: I) -> PolarsResult<GroupBy<'_>>
    where
        I: IntoIterator<Item = S>,
        S: AsRef<str>,
    {
        let selected_keys = self.select_to_vec(by)?;
        self.group_by_with_series(selected_keys, true, false)
    }

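    /// Group the `DataFrame` using the given column names as grouping keys,
    /// keeping a stable group order.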
    pub fn group_by_stable<I, S>(&self, by: I) -> PolarsResult<GroupBy<'_>>
    where
        I: IntoIterator<Item = S>,
        S: AsRef<str>,
    {
        let selected_keys = self.select_to_vec(by)?;
        self.group_by_with_series(selected_keys, true, true)
    }
}

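/// Intermediate state of a group-by operation on a [`DataFrame`].
///
/// Created by [`DataFrame::group_by`], [`DataFrame::group_by_stable`] or
/// [`DataFrame::group_by_with_series`]; it holds the key columns, the
/// resolved group positions and an optional selection of columns to
/// aggregate.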
#[derive(Debug, Clone)]
pub struct GroupBy<'a> {
    pub df: &'a DataFrame,
    pub(crate) selected_keys: Vec<Column>,
    groups: GroupPositions,
    pub(crate) selected_agg: Option<Vec<PlSmallStr>>,
}

impl<'a> GroupBy<'a> {
    pub fn new(
        df: &'a DataFrame,
        by: Vec<Column>,
        groups: GroupPositions,
        selected_agg: Option<Vec<PlSmallStr>>,
    ) -> Self {
        GroupBy {
            df,
            selected_keys: by,
            groups,
            selected_agg,
        }
    }

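    /// Select the column(s) that should be aggregated.
    ///
    /// If no selection is made, all columns except the group keys are
    /// aggregated (see `prepare_agg`).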
    #[must_use]
    pub fn select<I: IntoIterator<Item = S>, S: Into<PlSmallStr>>(mut self, selection: I) -> Self {
        self.selected_agg = Some(selection.into_iter().map(|s| s.into()).collect());
        self
    }

    pub fn get_groups(&self) -> &GroupPositions {
        &self.groups
    }

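    /// Get a mutable reference to the group positions.
    ///
    /// # Safety
    ///
    /// The group positions must stay consistent with the underlying
    /// `DataFrame`: the aggregation and take paths in this module index into
    /// it by these positions without bounds checks.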
    pub unsafe fn get_groups_mut(&mut self) -> &mut GroupPositions {
        &mut self.groups
    }

    pub fn into_groups(self) -> GroupPositions {
        self.groups
    }

    pub fn keys_sliced(&self, slice: Option<(i64, usize)>) -> Vec<Column> {
        #[allow(unused_assignments)]
        let mut groups_owned = None;

        let groups = if let Some((offset, len)) = slice {
            groups_owned = Some(self.groups.slice(offset, len));
            groups_owned.as_deref().unwrap()
        } else {
            &self.groups
        };
        POOL.install(|| {
            self.selected_keys
                .par_iter()
                .map(Column::as_materialized_series)
                .map(|s| {
                    match groups {
                        GroupsType::Idx(groups) => {
                            let mut out = unsafe { s.take_slice_unchecked(groups.first()) };
                            if groups.sorted {
                                out.set_sorted_flag(s.is_sorted_flag());
                            };
                            out
                        },
                        GroupsType::Slice {
                            groups,
                            overlapping,
                            monotonic: _,
                        } => {
                            if *overlapping && !groups.is_empty() {
                                // Overlapping slice groups: take the contiguous range
                                // spanned by the first and last group.
                                let offset = groups[0][0];
                                let [upper_offset, upper_len] = groups[groups.len() - 1];
                                return s.slice(
                                    offset as i64,
                                    ((upper_offset + upper_len) - offset) as usize,
                                );
                            }

                            let indices = groups
                                .iter()
                                .map(|&[first, _len]| first)
                                .collect_ca(PlSmallStr::EMPTY);
                            let mut out = unsafe { s.take_unchecked(&indices) };
                            out.set_sorted_flag(s.is_sorted_flag());
                            out
                        },
                    }
                })
                .map(Column::from)
                .collect()
        })
    }

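    /// Get the key values of every group as columns, one row per group.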
    pub fn keys(&self) -> Vec<Column> {
        self.keys_sliced(None)
    }

    fn prepare_agg(&self) -> PolarsResult<(Vec<Column>, Vec<Column>)> {
        let keys = self.keys();

        let agg_col = match &self.selected_agg {
            Some(selection) => self.df.select_to_vec(selection),
            None => {
                // No explicit selection: aggregate every column that is not a key.
                let by: Vec<_> = self.selected_keys.iter().map(|s| s.name()).collect();
                let selection = self
                    .df
                    .columns()
                    .iter()
                    .map(|s| s.name())
                    .filter(|a| !by.contains(a))
                    .cloned()
                    .collect::<Vec<_>>();

                self.df.select_to_vec(selection.as_slice())
            },
        }?;

        Ok((keys, agg_col))
    }

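    /// Aggregate the grouped `DataFrame` by taking the mean of the selected
    /// columns per group; output columns get a `_mean` suffix
    /// (see [`fmt_group_by_column`]).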
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn mean(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Mean);
            let mut agg = unsafe { agg_col.agg_mean(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }

        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn sum(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Sum);
            let mut agg = unsafe { agg_col.agg_sum(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn min(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Min);
            let mut agg = unsafe { agg_col.agg_min(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn max(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Max);
            let mut agg = unsafe { agg_col.agg_max(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn first(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::First);
            let mut agg = unsafe { agg_col.agg_first(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn last(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Last);
            let mut agg = unsafe { agg_col.agg_last(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn n_unique(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::NUnique);
            let mut agg = unsafe { agg_col.agg_n_unique(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn quantile(&self, quantile: f64, method: QuantileMethod) -> PolarsResult<DataFrame> {
        polars_ensure!(
            (0.0..=1.0).contains(&quantile),
            ComputeError: "`quantile` should be between 0.0 and 1.0"
        );
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(
                agg_col.name().as_str(),
                GroupByMethod::Quantile(quantile, method),
            );
            let mut agg = unsafe { agg_col.agg_quantile(&self.groups, quantile, method) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn median(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Median);
            let mut agg = unsafe { agg_col.agg_median(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn var(&self, ddof: u8) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Var(ddof));
            let mut agg = unsafe { agg_col.agg_var(&self.groups, ddof) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn std(&self, ddof: u8) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Std(ddof));
            let mut agg = unsafe { agg_col.agg_std(&self.groups, ddof) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

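    /// Count the number of rows per group (nulls included); output columns
    /// get a `_count` suffix.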
    pub fn count(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(
                agg_col.name().as_str(),
                GroupByMethod::Count {
                    include_nulls: true,
                },
            );
            let mut ca = self.groups.group_count();
            ca.rename(new_name);
            cols.push(ca.into_column());
        }
        DataFrame::new_infer_height(cols)
    }

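    /// Return the group keys together with a `groups` list column holding the
    /// row indices of every group.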
    pub fn groups(&self) -> PolarsResult<DataFrame> {
        let mut cols = self.keys();
        let mut column = self.groups.as_list_chunked();
        let new_name = fmt_group_by_column("", GroupByMethod::Groups);
        column.rename(new_name);
        cols.push(column.into_column());
        DataFrame::new_infer_height(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn agg_list(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Implode);
            let mut agg = unsafe { agg_col.agg_list(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    fn prepare_apply(&self) -> PolarsResult<DataFrame> {
        if let Some(agg) = &self.selected_agg {
            if agg.is_empty() {
                Ok(self.df.clone())
            } else {
                let mut new_cols = Vec::with_capacity(self.selected_keys.len() + agg.len());
                new_cols.extend_from_slice(&self.selected_keys);
                let cols = self.df.select_to_vec(agg.as_slice())?;
                new_cols.extend(cols);
                Ok(unsafe { DataFrame::new_unchecked(self.df.height(), new_cols) })
            }
        } else {
            Ok(self.df.clone())
        }
    }

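    /// Apply a closure over the groups in parallel and vertically concatenate
    /// the resulting `DataFrame`s.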
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn par_apply<F>(&self, f: F) -> PolarsResult<DataFrame>
    where
        F: Fn(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        polars_ensure!(self.df.height() > 0, ComputeError: "cannot group_by + apply on empty 'DataFrame'");
        let df = self.prepare_apply()?;
        let dfs = self
            .get_groups()
            .par_iter()
            .map(|g| {
                let sub_df = unsafe { take_df(&df, g) };
                f(sub_df)
            })
            .collect::<PolarsResult<Vec<_>>>()?;

        let mut df = accumulate_dataframes_vertical(dfs)?;
        df.rechunk_mut_par();
        Ok(df)
    }

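    /// Apply a closure over every group sequentially and vertically
    /// concatenate the results.
    ///
    /// # Example
    ///
    /// A minimal sketch; `date` is a placeholder key column and the input is
    /// assumed to be non-empty:
    ///
    /// ```ignore
    /// use polars_core::prelude::*;
    ///
    /// fn first_row_per_group(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     // Keep only the first row of every group keyed by `date`.
    ///     df.group_by(["date"])?.apply(|group| Ok(group.head(Some(1))))
    /// }
    /// ```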
    pub fn apply<F>(&self, f: F) -> PolarsResult<DataFrame>
    where
        F: FnMut(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        self.apply_sliced(None, f, None)
    }

    pub fn apply_sliced<F>(
        &self,
        slice: Option<(i64, usize)>,
        mut f: F,
        schema: Option<&SchemaRef>,
    ) -> PolarsResult<DataFrame>
    where
        F: FnMut(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        if self.df.height() == 0 {
            if let Some(schema) = schema {
                return Ok(DataFrame::empty_with_arc_schema(schema.clone()));
            }

            polars_bail!(ComputeError: "cannot group_by + apply on empty 'DataFrame'");
        }

        let df = self.prepare_apply()?;
        let max_height = if let Some((offset, len)) = slice {
            offset.try_into().unwrap_or(usize::MAX).saturating_add(len)
        } else {
            usize::MAX
        };
        let mut height = 0;
        let mut dfs = Vec::with_capacity(self.get_groups().len());
        for g in self.get_groups().iter() {
            let sub_df = unsafe { take_df(&df, g) };
            let df = f(sub_df)?;
            height += df.height();
            dfs.push(df);

            // Stop early once enough rows are produced to cover the requested slice.
            if height >= max_height {
                break;
            }
        }

        let mut df = accumulate_dataframes_vertical(dfs)?;
        if let Some((offset, len)) = slice {
            df = df.slice(offset, len);
        }
        Ok(df)
    }

    pub fn sliced(mut self, slice: Option<(i64, usize)>) -> Self {
        match slice {
            None => self,
            Some((offset, length)) => {
                self.groups = self.groups.slice(offset, length);
                self.selected_keys = self.keys_sliced(slice);
                self
            },
        }
    }
}

unsafe fn take_df(df: &DataFrame, g: GroupsIndicator) -> DataFrame {
    match g {
        GroupsIndicator::Idx(idx) => df.take_slice_unchecked(idx.1),
        GroupsIndicator::Slice([first, len]) => df.slice(first as i64, len as usize),
    }
}

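/// The aggregation applied in a group-by operation; also determines the
/// output column name via [`fmt_group_by_column`].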
#[derive(Copy, Clone, Debug)]
pub enum GroupByMethod {
    Min,
    NanMin,
    Max,
    NanMax,
    Median,
    Mean,
    First,
    FirstNonNull,
    Last,
    LastNonNull,
    Item { allow_empty: bool },
    Sum,
    Groups,
    NUnique,
    Quantile(f64, QuantileMethod),
    Count { include_nulls: bool },
    Implode,
    Std(u8),
    Var(u8),
    ArgMin,
    ArgMax,
}

impl Display for GroupByMethod {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        use GroupByMethod::*;
        let s = match self {
            Min => "min",
            NanMin => "nan_min",
            Max => "max",
            NanMax => "nan_max",
            Median => "median",
            Mean => "mean",
            First => "first",
            FirstNonNull => "first_non_null",
            Last => "last",
            LastNonNull => "last_non_null",
            Item { .. } => "item",
            Sum => "sum",
            Groups => "groups",
            NUnique => "n_unique",
            Quantile(_, _) => "quantile",
            Count { .. } => "count",
            Implode => "list",
            Std(_) => "std",
            Var(_) => "var",
            ArgMin => "arg_min",
            ArgMax => "arg_max",
        };
        write!(f, "{s}")
    }
}

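/// Derive the output column name for an aggregation, e.g. `"temp"` aggregated
/// with [`GroupByMethod::Mean`] becomes `"temp_mean"`.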
pub fn fmt_group_by_column(name: &str, method: GroupByMethod) -> PlSmallStr {
    use GroupByMethod::*;
    match method {
        Min => format_pl_smallstr!("{name}_min"),
        Max => format_pl_smallstr!("{name}_max"),
        NanMin => format_pl_smallstr!("{name}_nan_min"),
        NanMax => format_pl_smallstr!("{name}_nan_max"),
        Median => format_pl_smallstr!("{name}_median"),
        Mean => format_pl_smallstr!("{name}_mean"),
        First => format_pl_smallstr!("{name}_first"),
        FirstNonNull => format_pl_smallstr!("{name}_first_non_null"),
        Last => format_pl_smallstr!("{name}_last"),
        LastNonNull => format_pl_smallstr!("{name}_last_non_null"),
        Item { .. } => format_pl_smallstr!("{name}_item"),
        Sum => format_pl_smallstr!("{name}_sum"),
        Groups => PlSmallStr::from_static("groups"),
        NUnique => format_pl_smallstr!("{name}_n_unique"),
        Count { .. } => format_pl_smallstr!("{name}_count"),
        Implode => format_pl_smallstr!("{name}_agg_list"),
        Quantile(quantile, _interpol) => format_pl_smallstr!("{name}_quantile_{quantile:.2}"),
        Std(_) => format_pl_smallstr!("{name}_agg_std"),
        Var(_) => format_pl_smallstr!("{name}_agg_var"),
        ArgMin => format_pl_smallstr!("{name}_arg_min"),
        ArgMax => format_pl_smallstr!("{name}_arg_max"),
    }
}

#[cfg(test)]
mod test {
    use num_traits::FloatConst;

    use crate::prelude::*;

    #[test]
    #[cfg(feature = "dtype-date")]
    #[cfg_attr(miri, ignore)]
    fn test_group_by() -> PolarsResult<()> {
        let s0 = Column::new(
            PlSmallStr::from_static("date"),
            &[
                "2020-08-21",
                "2020-08-21",
                "2020-08-22",
                "2020-08-23",
                "2020-08-22",
            ],
        );
        let s1 = Column::new(PlSmallStr::from_static("temp"), [20, 10, 7, 9, 1]);
        let s2 = Column::new(PlSmallStr::from_static("rain"), [0.2, 0.1, 0.3, 0.1, 0.01]);
        let df = DataFrame::new_infer_height(vec![s0, s1, s2]).unwrap();

        let out = df.group_by_stable(["date"])?.select(["temp"]).count()?;
        assert_eq!(
            out.column("temp_count")?,
            &Column::new(PlSmallStr::from_static("temp_count"), [2 as IdxSize, 2, 1])
        );

        #[allow(deprecated)]
        let out = df
            .group_by_stable(["date"])?
            .select(["temp", "rain"])
            .mean()?;
        assert_eq!(
            out.column("temp_mean")?,
            &Column::new(PlSmallStr::from_static("temp_mean"), [15.0f64, 4.0, 9.0])
        );

        #[allow(deprecated)]
        let out = df
            .group_by_stable(["date", "temp"])?
            .select(["rain"])
            .mean()?;
        assert!(out.column("rain_mean").is_ok());

        #[allow(deprecated)]
        let out = df.group_by_stable(["date"])?.select(["temp"]).sum()?;
        assert_eq!(
            out.column("temp_sum")?,
            &Column::new(PlSmallStr::from_static("temp_sum"), [30, 8, 9])
        );

        #[allow(deprecated)]
        let gb = df.group_by(["date"]).unwrap().n_unique().unwrap();
        assert_eq!(gb.width(), 3);
        Ok(())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_static_group_by_by_12_columns() {
        let s0 = Column::new("G1".into(), ["A", "A", "B", "B", "C"].as_ref());
        let s1 = Column::new("N".into(), [1, 2, 2, 4, 2].as_ref());
        let s2 = Column::new("G2".into(), ["k", "l", "m", "m", "l"].as_ref());
        let s3 = Column::new("G3".into(), ["a", "b", "c", "c", "d"].as_ref());
        let s4 = Column::new("G4".into(), ["1", "2", "3", "3", "4"].as_ref());
        let s5 = Column::new("G5".into(), ["X", "Y", "Z", "Z", "W"].as_ref());
        let s6 = Column::new("G6".into(), [false, true, true, true, false].as_ref());
        let s7 = Column::new("G7".into(), ["r", "x", "q", "q", "o"].as_ref());
        let s8 = Column::new("G8".into(), ["R", "X", "Q", "Q", "O"].as_ref());
        let s9 = Column::new("G9".into(), [1, 2, 3, 3, 4].as_ref());
        let s10 = Column::new("G10".into(), [".", "!", "?", "?", "/"].as_ref());
        let s11 = Column::new("G11".into(), ["(", ")", "@", "@", "$"].as_ref());
        let s12 = Column::new("G12".into(), ["-", "_", ";", ";", ","].as_ref());

        let df = DataFrame::new_infer_height(vec![
            s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
        ])
        .unwrap();

        #[allow(deprecated)]
        let adf = df
            .group_by([
                "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12",
            ])
            .unwrap()
            .select(["N"])
            .sum()
            .unwrap();

        assert_eq!(
            Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
            &[Some(1), Some(2), Some(2), Some(6)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_dynamic_group_by_by_13_columns() {
        let series_content = ["A", "A", "B", "B", "C"];

        let series_names = [
            "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12", "G13",
        ];

        let mut columns = Vec::with_capacity(14);

        for series_name in series_names {
            let group_columns = Column::new(series_name.into(), series_content.as_ref());
            columns.push(group_columns);
        }

        let agg_series = Column::new("N".into(), [1, 2, 3, 3, 4].as_ref());
        columns.push(agg_series);

        let df = DataFrame::new_infer_height(columns).unwrap();

        #[allow(deprecated)]
        let adf = df
            .group_by(series_names)
            .unwrap()
            .select(["N"])
            .sum()
            .unwrap();

        for series_name in &series_names {
            assert_eq!(
                Vec::from(&adf.column(series_name).unwrap().str().unwrap().sort(false)),
                &[Some("A"), Some("B"), Some("C")]
            );
        }

        assert_eq!(
            Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
            &[Some(3), Some(4), Some(6)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_floats() {
        let df = df! {"flt" => [1., 1., 2., 2., 3.],
            "val" => [1, 1, 1, 1, 1]
        }
        .unwrap();
        #[allow(deprecated)]
        let res = df.group_by(["flt"]).unwrap().sum().unwrap();
        let res = res.sort(["flt"], SortMultipleOptions::default()).unwrap();
        assert_eq!(
            Vec::from(res.column("val_sum").unwrap().i32().unwrap()),
            &[Some(2), Some(2), Some(1)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    #[cfg(feature = "dtype-categorical")]
    fn test_group_by_categorical() {
        let mut df = df! {"foo" => ["a", "a", "b", "b", "c"],
            "ham" => ["a", "a", "b", "b", "c"],
            "bar" => [1, 1, 1, 1, 1]
        }
        .unwrap();

        df.apply("foo", |s| {
            s.cast(&DataType::from_categories(Categories::global()))
                .unwrap()
        })
        .unwrap();

        #[allow(deprecated)]
        let res = df
            .group_by_stable(["foo", "ham"])
            .unwrap()
            .select(["bar"])
            .sum()
            .unwrap();

        assert_eq!(
            Vec::from(
                res.column("bar_sum")
                    .unwrap()
                    .as_materialized_series()
                    .i32()
                    .unwrap()
            ),
            &[Some(2), Some(2), Some(1)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_null_handling() -> PolarsResult<()> {
        let df = df!(
            "a" => ["a", "a", "a", "b", "b"],
            "b" => [Some(1), Some(2), None, None, Some(1)]
        )?;
        #[allow(deprecated)]
        let out = df.group_by_stable(["a"])?.mean()?;

        assert_eq!(
            Vec::from(out.column("b_mean")?.as_materialized_series().f64()?),
            &[Some(1.5), Some(1.0)]
        );
        Ok(())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_var() -> PolarsResult<()> {
        let df = df![
            "g" => ["foo", "foo", "bar"],
            "flt" => [1.0, 2.0, 3.0],
            "int" => [1, 2, 3]
        ]?;

        #[allow(deprecated)]
        let out = df.group_by_stable(["g"])?.select(["int"]).var(1)?;

        assert_eq!(out.column("int_agg_var")?.f64()?.get(0), Some(0.5));
        #[allow(deprecated)]
        let out = df.group_by_stable(["g"])?.select(["int"]).std(1)?;
        let val = out.column("int_agg_std")?.f64()?.get(0).unwrap();
        let expected = f64::FRAC_1_SQRT_2();
        assert!((val - expected).abs() < 0.000001);
        Ok(())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    #[cfg(feature = "dtype-categorical")]
    fn test_group_by_null_group() -> PolarsResult<()> {
        let mut df = df![
            "g" => [Some("foo"), Some("foo"), Some("bar"), None, None],
            "flt" => [1.0, 2.0, 3.0, 1.0, 1.0],
            "int" => [1, 2, 3, 1, 1]
        ]?;

        df.try_apply("g", |s| {
            s.cast(&DataType::from_categories(Categories::global()))
        })?;

        #[allow(deprecated)]
        let _ = df.group_by(["g"])?.sum()?;
        Ok(())
    }
}