use super::hash_join::prepare_hashed_relation;
use crate::chunked_array::{builder::PrimitiveChunkedBuilder, float::IntegerDecode};
use crate::frame::select::Selection;
use crate::prelude::*;
use crate::utils::{IntoDynamicZip, Xob};
use ahash::RandomState;
use arrow::array::{PrimitiveBuilder, StringBuilder};
use itertools::Itertools;
use num::{Num, NumCast, ToPrimitive, Zero};
use rayon::prelude::*;
use std::collections::{HashMap, HashSet};
use std::hash::Hash;
use std::{
fmt::{Debug, Formatter},
ops::Add,
};
/// Groups the items yielded by `a` by equality.
///
/// Returns one `(first, indexes)` tuple per distinct key, where `indexes` is the index list
/// produced by `prepare_hashed_relation` for that key and `first` is its first element.
fn groupby<T>(a: impl Iterator<Item = T>) -> Vec<(usize, Vec<usize>)>
where
    T: Hash + Eq,
{
    prepare_hashed_relation(a)
        .into_iter()
        .map(|(_key, indexes)| {
            // SAFETY: presumably every bucket built by `prepare_hashed_relation` holds at least
            // one index, so position 0 exists -- TODO confirm against that helper.
            let first = unsafe { *indexes.get_unchecked(0) };
            (first, indexes)
        })
        .collect()
}
/// Types that can produce group tuples: one `(first index, all indexes)` pair per group.
pub trait IntoGroupTuples {
/// Returns the group tuples for `self`.
///
/// The default implementation panics via `unimplemented!()`; types that support grouping
/// override it.
fn group_tuples(&self) -> Vec<(usize, Vec<usize>)> {
unimplemented!()
}
}
/// Computes the group tuples of a chunked array whose items are hashable.
///
/// When the array reports no nulls the no-null iterator is used; otherwise the regular
/// iterator is used.
fn group_tuples<'a, T>(ca: &'a ChunkedArray<T>) -> Vec<(usize, Vec<usize>)>
where
    &'a ChunkedArray<T>: IntoNoNullIterator + IntoIterator,
    <&'a ChunkedArray<T> as IntoIterator>::Item: Eq + Hash,
    <&'a ChunkedArray<T> as IntoNoNullIterator>::Item: Eq + Hash,
{
    match ca.null_count() {
        0 => groupby(ca.into_no_null_iter()),
        _ => groupby(ca.into_iter()),
    }
}
// Integer-typed chunked arrays with hashable native values are grouped through the generic
// free function `group_tuples`.
impl<T> IntoGroupTuples for ChunkedArray<T>
where
T: PolarsIntegerType,
T::Native: Eq + Hash,
{
fn group_tuples(&self) -> Vec<(usize, Vec<usize>)> {
group_tuples(self)
}
}
// Boolean arrays are grouped through the generic free function `group_tuples`.
impl IntoGroupTuples for BooleanChunked {
fn group_tuples(&self) -> Vec<(usize, Vec<usize>)> {
group_tuples(self)
}
}
// String arrays are grouped through the generic free function `group_tuples`.
impl IntoGroupTuples for Utf8Chunked {
fn group_tuples(&self) -> Vec<(usize, Vec<usize>)> {
group_tuples(self)
}
}
// Floats are grouped on the result of `integer_decode()` rather than on the float itself,
// which gives a hashable key.
impl IntoGroupTuples for Float64Chunked {
    fn group_tuples(&self) -> Vec<(usize, Vec<usize>)> {
        if self.null_count() == 0 {
            groupby(self.into_no_null_iter().map(|val| val.integer_decode()))
        } else {
            groupby(
                self.into_iter()
                    .map(|opt| opt.map(|val| val.integer_decode())),
            )
        }
    }
}
// Floats are grouped on the result of `integer_decode()` rather than on the float itself,
// which gives a hashable key.
impl IntoGroupTuples for Float32Chunked {
    fn group_tuples(&self) -> Vec<(usize, Vec<usize>)> {
        if self.null_count() == 0 {
            groupby(self.into_no_null_iter().map(|val| val.integer_decode()))
        } else {
            groupby(
                self.into_iter()
                    .map(|opt| opt.map(|val| val.integer_decode())),
            )
        }
    }
}
// List arrays use the trait's default `group_tuples`, which panics with `unimplemented!()`.
impl IntoGroupTuples for ListChunked {}
// Object arrays (only compiled with the "object" feature) also use the panicking default.
#[cfg(feature = "object")]
impl<T> IntoGroupTuples for ObjectChunked<T> {}
/// A single hashable, comparable key value used when grouping on multiple columns.
///
/// Each variant wraps the value of one supported column type. Floats are stored as the
/// three-part result of `integer_decode()`, which the `Debug` impl labels as mantissa,
/// exponent and sign.
#[derive(Copy, Clone, Hash, Eq, PartialEq)]
enum Groupable<'a> {
Boolean(bool),
Utf8(&'a str),
UInt8(u8),
UInt16(u16),
UInt32(u32),
UInt64(u64),
Int8(i8),
Int16(i16),
Int32(i32),
Int64(i64),
// (mantissa, exponent, sign)
Float32(u64, i16, i8),
// (mantissa, exponent, sign)
Float64(u64, i16, i8),
}
impl<'a> Debug for Groupable<'a> {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
use Groupable::*;
match self {
Boolean(v) => write!(f, "{}", v),
Utf8(v) => write!(f, "{}", v),
UInt8(v) => write!(f, "{}", v),
UInt16(v) => write!(f, "{}", v),
UInt32(v) => write!(f, "{}", v),
UInt64(v) => write!(f, "{}", v),
Int8(v) => write!(f, "{}", v),
Int16(v) => write!(f, "{}", v),
Int32(v) => write!(f, "{}", v),
Int64(v) => write!(f, "{}", v),
Float32(m, e, s) => write!(f, "float32 mantissa: {} exponent: {} sign: {}", m, e, s),
Float64(m, e, s) => write!(f, "float64 mantissa: {} exponent: {} sign: {}", m, e, s),
}
}
}
// Converts an `f64` into its hashable `Groupable::Float64` form via `integer_decode()`.
impl From<f64> for Groupable<'_> {
    fn from(v: f64) -> Self {
        let (mantissa, exponent, sign) = v.integer_decode();
        Groupable::Float64(mantissa, exponent, sign)
    }
}
// Converts an `f32` into its hashable `Groupable::Float32` form via `integer_decode()`.
//
// The value must land in the `Float32` variant: using `Float64` here would label f32 keys as
// "float64" in the `Debug` output (which pivot uses for column names) and would make f32 and
// f64 keys with the same decomposition compare and hash as equal.
impl From<f32> for Groupable<'_> {
    fn from(v: f32) -> Self {
        let (m, e, s) = v.integer_decode();
        Groupable::Float32(m, e, s)
    }
}
/// Wraps a numeric chunked array in a boxed iterator of optional `Groupable` keys.
///
/// Every present value is converted with `Into<Groupable>`; missing values stay `None`.
fn float_to_groupable_iter<'a, T>(
    ca: &'a ChunkedArray<T>,
) -> Box<dyn Iterator<Item = Option<Groupable>> + 'a>
where
    T: PolarsNumericType,
    T::Native: Into<Groupable<'a>>,
{
    Box::new(ca.into_iter().map(|opt| opt.map(|val| val.into())))
}
impl<'b> (dyn SeriesTrait + 'b) {
fn as_groupable_iter<'a>(&'a self) -> Result<Box<dyn Iterator<Item = Option<Groupable>> + 'a>> {
macro_rules! as_groupable_iter {
($ca:expr, $variant:ident ) => {{
let bx = Box::new($ca.into_iter().map(|opt_b| opt_b.map(Groupable::$variant)));
Ok(bx)
}};
}
match self.dtype() {
ArrowDataType::Boolean => as_groupable_iter!(self.bool().unwrap(), Boolean),
ArrowDataType::UInt8 => as_groupable_iter!(self.u8().unwrap(), UInt8),
ArrowDataType::UInt16 => as_groupable_iter!(self.u16().unwrap(), UInt16),
ArrowDataType::UInt32 => as_groupable_iter!(self.u32().unwrap(), UInt32),
ArrowDataType::UInt64 => as_groupable_iter!(self.u64().unwrap(), UInt64),
ArrowDataType::Int8 => as_groupable_iter!(self.i8().unwrap(), Int8),
ArrowDataType::Int16 => as_groupable_iter!(self.i16().unwrap(), Int16),
ArrowDataType::Int32 => as_groupable_iter!(self.i32().unwrap(), Int32),
ArrowDataType::Int64 => as_groupable_iter!(self.i64().unwrap(), Int64),
ArrowDataType::Date32(DateUnit::Day) => {
as_groupable_iter!(self.date32().unwrap(), Int32)
}
ArrowDataType::Date64(DateUnit::Millisecond) => {
as_groupable_iter!(self.date64().unwrap(), Int64)
}
ArrowDataType::Time64(TimeUnit::Nanosecond) => {
as_groupable_iter!(self.time64_nanosecond().unwrap(), Int64)
}
ArrowDataType::Duration(TimeUnit::Nanosecond) => {
as_groupable_iter!(self.duration_nanosecond().unwrap(), Int64)
}
ArrowDataType::Duration(TimeUnit::Millisecond) => {
as_groupable_iter!(self.duration_millisecond().unwrap(), Int64)
}
#[cfg(feature = "dtype-interval")]
ArrowDataType::Interval(IntervalUnit::DayTime) => {
as_groupable_iter!(self.interval_daytime().unwrap(), Int64)
}
#[cfg(feature = "dtype-interval")]
ArrowDataType::Interval(IntervalUnit::YearMonth) => {
as_groupable_iter!(self.interval_year_month().unwrap(), Int32)
}
ArrowDataType::Utf8 => as_groupable_iter!(self.utf8().unwrap(), Utf8),
ArrowDataType::Float32 => Ok(float_to_groupable_iter(self.f32().unwrap())),
ArrowDataType::Float64 => Ok(float_to_groupable_iter(self.f64().unwrap())),
dt => Err(PolarsError::Other(
format!("Column with dtype {:?} is not groupable", dt).into(),
)),
}
}
}
impl DataFrame {
/// Groups the rows of this DataFrame by the columns selected with `by`.
///
/// With a single key column the groups come from that series' `group_tuples()`. With 2 to 12
/// key columns the per-column `Groupable` iterators are zipped statically; with any other
/// number of keys they are combined through `into_dynamic_zip()`.
///
/// # Errors
/// Propagates the error from `select_series`, and from `as_groupable_iter` when a key column
/// is not groupable (multi-key paths only).
pub fn groupby<'g, J, S: Selection<'g, J>>(&self, by: S) -> Result<GroupBy> {
// Recursively expands to a nested `zip` over the groupable iterators of key columns
// 0..=N, so the tuple type of the zipped items is known at compile time.
macro_rules! static_zip {
($selected_keys:ident, 0) => {
$selected_keys[0].as_groupable_iter()?
};
($selected_keys:ident, 1) => {
static_zip!($selected_keys, 0).zip($selected_keys[1].as_groupable_iter()?)
};
($selected_keys:ident, 2) => {
static_zip!($selected_keys, 1).zip($selected_keys[2].as_groupable_iter()?)
};
($selected_keys:ident, 3) => {
static_zip!($selected_keys, 2).zip($selected_keys[3].as_groupable_iter()?)
};
($selected_keys:ident, 4) => {
static_zip!($selected_keys, 3).zip($selected_keys[4].as_groupable_iter()?)
};
($selected_keys:ident, 5) => {
static_zip!($selected_keys, 4).zip($selected_keys[5].as_groupable_iter()?)
};
($selected_keys:ident, 6) => {
static_zip!($selected_keys, 5).zip($selected_keys[6].as_groupable_iter()?)
};
($selected_keys:ident, 7) => {
static_zip!($selected_keys, 6).zip($selected_keys[7].as_groupable_iter()?)
};
($selected_keys:ident, 8) => {
static_zip!($selected_keys, 7).zip($selected_keys[8].as_groupable_iter()?)
};
($selected_keys:ident, 9) => {
static_zip!($selected_keys, 8).zip($selected_keys[9].as_groupable_iter()?)
};
($selected_keys:ident, 10) => {
static_zip!($selected_keys, 9).zip($selected_keys[10].as_groupable_iter()?)
};
($selected_keys:ident, 11) => {
static_zip!($selected_keys, 10).zip($selected_keys[11].as_groupable_iter()?)
};
}
let selected_keys = self.select_series(by)?;
// The macro argument is the index of the last key column, i.e. `len - 1`.
let groups = match selected_keys.len() {
1 => {
let series = &selected_keys[0];
series.group_tuples()
}
2 => groupby(static_zip!(selected_keys, 1)),
3 => groupby(static_zip!(selected_keys, 2)),
4 => groupby(static_zip!(selected_keys, 3)),
5 => groupby(static_zip!(selected_keys, 4)),
6 => groupby(static_zip!(selected_keys, 5)),
7 => groupby(static_zip!(selected_keys, 6)),
8 => groupby(static_zip!(selected_keys, 7)),
9 => groupby(static_zip!(selected_keys, 8)),
10 => groupby(static_zip!(selected_keys, 9)),
11 => groupby(static_zip!(selected_keys, 10)),
12 => groupby(static_zip!(selected_keys, 11)),
// Any other key count (including zero) takes the dynamically zipped path.
_ => {
let iter = selected_keys
.iter()
.map(|sk| sk.as_groupable_iter())
.collect::<Result<Vec<_>>>()?
.into_dynamic_zip();
groupby(iter)
}
};
Ok(GroupBy {
df: self,
selected_keys,
groups,
selected_agg: None,
})
}
}
/// The result of `DataFrame::groupby`: the grouped frame plus the computed groups.
#[derive(Debug, Clone)]
pub struct GroupBy<'df, 'selection_str> {
// The DataFrame that was grouped.
df: &'df DataFrame,
// The key columns the frame was grouped by.
selected_keys: Vec<Series>,
// One `(first index, all indexes)` tuple per group.
pub(crate) groups: Vec<(usize, Vec<usize>)>,
// Columns to aggregate; `None` means "all non-key columns" (see `prepare_agg`).
selected_agg: Option<Vec<&'selection_str str>>,
}
/// Per-group numeric aggregations.
///
/// Every method receives the group tuples and returns one aggregated value per group as a
/// `Series`. The defaults return `None`, meaning the aggregation is not available for the
/// implementing type.
pub(crate) trait NumericAggSync {
/// Mean per group; `None` by default.
fn agg_mean(&self, _groups: &[(usize, Vec<usize>)]) -> Option<Series> {
None
}
/// Minimum per group; `None` by default.
fn agg_min(&self, _groups: &[(usize, Vec<usize>)]) -> Option<Series> {
None
}
/// Maximum per group; `None` by default.
fn agg_max(&self, _groups: &[(usize, Vec<usize>)]) -> Option<Series> {
None
}
/// Sum per group; `None` by default.
fn agg_sum(&self, _groups: &[(usize, Vec<usize>)]) -> Option<Series> {
None
}
}
// Non-numeric array types keep the trait defaults, so every numeric aggregation yields `None`.
impl NumericAggSync for BooleanChunked {}
impl NumericAggSync for Utf8Chunked {}
impl NumericAggSync for ListChunked {}
#[cfg(feature = "object")]
impl<T> NumericAggSync for ObjectChunked<T> {}
// Numeric aggregations, computed in parallel over the groups with rayon.
//
// Each method has two paths per group:
// * fast path: when `cont_slice()` succeeds the values are read straight from the slice
//   (presumably only possible for a single contiguous chunk without nulls -- TODO confirm);
// * fallback: the group's values are gathered with `take_unchecked` and the aggregation of
//   the resulting array is used.
//
// The capacity hint given to `take_unchecked` is the number of indices in the group, matching
// every other gather in this file, so no group reserves room for the whole column.
impl<T> NumericAggSync for ChunkedArray<T>
where
    T: PolarsNumericType + Sync,
    T::Native: std::ops::Add<Output = T::Native> + Num + NumCast,
    ChunkedArray<T>: IntoSeries,
{
    /// Mean of every group as a `Float64Chunked` series.
    fn agg_mean(&self, groups: &[(usize, Vec<usize>)]) -> Option<Series> {
        let ca: Float64Chunked = groups
            .par_iter()
            .map(|(_first, idx)| {
                if let Ok(slice) = self.cont_slice() {
                    let mut sum = 0.;
                    for i in idx {
                        sum += slice[*i].to_f64().unwrap()
                    }
                    Some(sum / idx.len() as f64)
                } else {
                    // SAFETY: relies on the group indexes being valid positions in `self`.
                    let take =
                        unsafe { self.take_unchecked(idx.iter().copied(), Some(idx.len())) };
                    let opt_sum: Option<T::Native> = take.sum();
                    // NOTE(review): the divisor is the full group size, not the number of
                    // non-null values -- confirm this is the intended mean for groups with nulls.
                    opt_sum.map(|sum| sum.to_f64().unwrap() / idx.len() as f64)
                }
            })
            .collect();
        Some(ca.into_series())
    }

    /// Minimum of every group, in the array's own type.
    fn agg_min(&self, groups: &[(usize, Vec<usize>)]) -> Option<Series> {
        Some(
            groups
                .par_iter()
                .map(|(_first, idx)| {
                    if let Ok(slice) = self.cont_slice() {
                        let mut min = None;
                        for i in idx {
                            let v = slice[*i];
                            min = match min {
                                Some(min) => {
                                    if min < v {
                                        Some(min)
                                    } else {
                                        Some(v)
                                    }
                                }
                                None => Some(v),
                            };
                        }
                        min
                    } else {
                        // SAFETY: relies on the group indexes being valid positions in `self`.
                        let take =
                            unsafe { self.take_unchecked(idx.iter().copied(), Some(idx.len())) };
                        take.min()
                    }
                })
                .collect::<ChunkedArray<T>>()
                .into_series(),
        )
    }

    /// Maximum of every group, in the array's own type.
    fn agg_max(&self, groups: &[(usize, Vec<usize>)]) -> Option<Series> {
        Some(
            groups
                .par_iter()
                .map(|(_first, idx)| {
                    if let Ok(slice) = self.cont_slice() {
                        let mut max = None;
                        for i in idx {
                            let v = slice[*i];
                            max = match max {
                                Some(max) => {
                                    if max > v {
                                        Some(max)
                                    } else {
                                        Some(v)
                                    }
                                }
                                None => Some(v),
                            };
                        }
                        max
                    } else {
                        // SAFETY: relies on the group indexes being valid positions in `self`.
                        let take =
                            unsafe { self.take_unchecked(idx.iter().copied(), Some(idx.len())) };
                        take.max()
                    }
                })
                .collect::<ChunkedArray<T>>()
                .into_series(),
        )
    }

    /// Sum of every group, in the array's own type.
    fn agg_sum(&self, groups: &[(usize, Vec<usize>)]) -> Option<Series> {
        Some(
            groups
                .par_iter()
                .map(|(_first, idx)| {
                    if let Ok(slice) = self.cont_slice() {
                        let mut sum = Zero::zero();
                        for i in idx {
                            sum = sum + slice[*i]
                        }
                        Some(sum)
                    } else {
                        // SAFETY: relies on the group indexes being valid positions in `self`.
                        let take =
                            unsafe { self.take_unchecked(idx.iter().copied(), Some(idx.len())) };
                        take.sum()
                    }
                })
                .collect::<ChunkedArray<T>>()
                .into_series(),
        )
    }
}
/// Aggregation that produces one value per group, returned as a `Series`.
pub(crate) trait AggFirst {
/// Returns the per-group "first" values for the given group tuples.
fn agg_first(&self, _groups: &[(usize, Vec<usize>)]) -> Series;
}
// Shared body for `agg_first`: looks up the value at each group's first index and collects
// the results into `$ca_type` before converting to a series.
macro_rules! impl_agg_first {
    ($self:ident, $groups:ident, $ca_type:ty) => {{
        $groups
            .iter()
            .map(|group| $self.get(group.0))
            .collect::<$ca_type>()
            .into_series()
    }};
}
// Primitive arrays: take the value at every group's first index.
impl<T> AggFirst for ChunkedArray<T>
where
T: PolarsPrimitiveType + Send,
ChunkedArray<T>: IntoSeries,
{
fn agg_first(&self, groups: &[(usize, Vec<usize>)]) -> Series {
impl_agg_first!(self, groups, ChunkedArray<T>)
}
}
// String arrays: take the value at every group's first index.
impl AggFirst for Utf8Chunked {
fn agg_first(&self, groups: &[(usize, Vec<usize>)]) -> Series {
impl_agg_first!(self, groups, Utf8Chunked)
}
}
// List arrays: take the value at every group's first index.
impl AggFirst for ListChunked {
fn agg_first(&self, groups: &[(usize, Vec<usize>)]) -> Series {
impl_agg_first!(self, groups, ListChunked)
}
}
// Object arrays (only with the "object" feature): not implemented yet; calling this panics
// through `todo!()`.
#[cfg(feature = "object")]
impl<T> AggFirst for ObjectChunked<T> {
fn agg_first(&self, _groups: &[(usize, Vec<usize>)]) -> Series {
todo!()
}
}
/// Aggregation that produces one value per group, returned as a `Series`.
pub(crate) trait AggLast {
/// Returns the per-group "last" values for the given group tuples.
fn agg_last(&self, _groups: &[(usize, Vec<usize>)]) -> Series;
}
// Shared body for `agg_last`: looks up the value at the last index of each group's index list
// and collects the results into `$ca_type` before converting to a series.
macro_rules! impl_agg_last {
    ($self:ident, $groups:ident, $ca_type:ty) => {{
        $groups
            .iter()
            .map(|(_first, idx)| {
                let last_pos = idx.len() - 1;
                $self.get(idx[last_pos])
            })
            .collect::<$ca_type>()
            .into_series()
    }};
}
// Primitive arrays: take the value at the last index of every group.
impl<T> AggLast for ChunkedArray<T>
where
T: PolarsPrimitiveType + Send,
ChunkedArray<T>: IntoSeries,
{
fn agg_last(&self, groups: &[(usize, Vec<usize>)]) -> Series {
impl_agg_last!(self, groups, ChunkedArray<T>)
}
}
// String arrays: take the value at the last index of every group.
impl AggLast for Utf8Chunked {
fn agg_last(&self, groups: &[(usize, Vec<usize>)]) -> Series {
impl_agg_last!(self, groups, Utf8Chunked)
}
}
// List arrays: take the value at the last index of every group.
impl AggLast for ListChunked {
fn agg_last(&self, groups: &[(usize, Vec<usize>)]) -> Series {
impl_agg_last!(self, groups, ListChunked)
}
}
// Object arrays (only with the "object" feature): not implemented yet; calling this panics
// through `todo!()`.
#[cfg(feature = "object")]
impl<T> AggLast for ObjectChunked<T> {
fn agg_last(&self, _groups: &[(usize, Vec<usize>)]) -> Series {
todo!()
}
}
/// Per-group count of distinct values.
pub(crate) trait AggNUnique {
/// Returns the number of distinct values in every group, or `None` (the default) when the
/// implementing type does not support this aggregation.
fn agg_n_unique(&self, _groups: &[(usize, Vec<usize>)]) -> Option<UInt32Chunked> {
None
}
}
// Shared body for `agg_n_unique`: for every group (in parallel) the values at the group's
// indexes are inserted into a hash set and the set size is the result. Without nulls the raw
// values are hashed; with nulls the `Option` values are hashed, so a null is one more value.
macro_rules! impl_agg_n_unique {
    ($self:ident, $groups:ident, $ca_type:ty) => {{
        $groups
            .into_par_iter()
            .map(|(_first, idx)| {
                let n_distinct = if $self.null_count() == 0 {
                    let mut seen = HashSet::with_hasher(RandomState::new());
                    for &i in idx {
                        // SAFETY: relies on the group indexes being valid positions in the array.
                        seen.insert(unsafe { $self.get_unchecked(i) });
                    }
                    seen.len()
                } else {
                    let mut seen = HashSet::with_hasher(RandomState::new());
                    for &i in idx {
                        seen.insert($self.get(i));
                    }
                    seen.len()
                };
                n_distinct as u32
            })
            .collect::<$ca_type>()
            .into_inner()
    }};
}
// Integer arrays: distinct count per group via the shared `impl_agg_n_unique!` body.
impl<T> AggNUnique for ChunkedArray<T>
where
T: PolarsIntegerType + Sync,
T::Native: Hash + Eq,
{
fn agg_n_unique(&self, groups: &[(usize, Vec<usize>)]) -> Option<UInt32Chunked> {
Some(impl_agg_n_unique!(self, groups, Xob<UInt32Chunked>))
}
}
// These types keep the trait default, so `agg_n_unique` yields `None` for them.
impl AggNUnique for Float32Chunked {}
impl AggNUnique for Float64Chunked {}
impl AggNUnique for ListChunked {}
#[cfg(feature = "object")]
impl<T> AggNUnique for ObjectChunked<T> {}
// Boolean arrays: distinct count per group via the shared `impl_agg_n_unique!` body.
impl AggNUnique for BooleanChunked {
fn agg_n_unique(&self, groups: &[(usize, Vec<usize>)]) -> Option<UInt32Chunked> {
Some(impl_agg_n_unique!(self, groups, Xob<UInt32Chunked>))
}
}
// String arrays: distinct count per group via the shared `impl_agg_n_unique!` body.
impl AggNUnique for Utf8Chunked {
fn agg_n_unique(&self, groups: &[(usize, Vec<usize>)]) -> Option<UInt32Chunked> {
Some(impl_agg_n_unique!(self, groups, Xob<UInt32Chunked>))
}
}
/// Aggregation that collects the values of every group into a list.
pub(crate) trait AggList {
/// Returns a series with one list entry per group, or `None` (the default) when the
/// implementing type does not support this aggregation.
fn agg_list(&self, _groups: &[(usize, Vec<usize>)]) -> Option<Series> {
None
}
}
// Builds, for every group, a sub-series of the group's values and appends it to a list
// builder. The builder type is picked from the dtype by `match_arrow_data_type_apply_macro!`.
impl<T> AggList for ChunkedArray<T>
where
T: PolarsDataType,
ChunkedArray<T>: IntoSeries,
{
fn agg_list(&self, groups: &[(usize, Vec<usize>)]) -> Option<Series> {
// Body used for primitive dtypes: one list entry per group in a primitive list builder.
macro_rules! impl_gb {
($type:ty, $agg_col:expr) => {{
let values_builder = PrimitiveBuilder::<$type>::new(groups.len());
let mut builder =
ListPrimitiveChunkedBuilder::new("", values_builder, groups.len());
for (_first, idx) in groups {
// Gather the values at this group's indexes.
let s = unsafe {
$agg_col.take_iter_unchecked(&mut idx.into_iter().copied(), Some(idx.len()))
};
builder.append_opt_series(Some(&s))
}
builder.finish().into_series()
}};
}
// Body used for string columns: same as above with a Utf8 list builder.
macro_rules! impl_gb_utf8 {
($agg_col:expr) => {{
let values_builder = StringBuilder::new(groups.len());
let mut builder = ListUtf8ChunkedBuilder::new("", values_builder, groups.len());
for (_first, idx) in groups {
// Gather the values at this group's indexes.
let s = unsafe {
$agg_col.take_iter_unchecked(&mut idx.into_iter().copied(), Some(idx.len()))
};
builder.append_series(&s)
}
builder.finish().into_series()
}};
}
// The macros operate on a `Series`, so the array is cloned and converted first.
let s = self.clone().into_series();
Some(match_arrow_data_type_apply_macro!(
s.dtype(),
impl_gb,
impl_gb_utf8,
s
))
}
}
/// Per-group quantile aggregation.
pub(crate) trait AggQuantile {
/// Returns the requested quantile of every group, or `None` (the default) when the
/// implementing type does not support this aggregation.
fn agg_quantile(&self, _groups: &[(usize, Vec<usize>)], _quantile: f64) -> Option<Series> {
None
}
/// Median of every group, defined as the 0.5 quantile.
fn agg_median(&self, groups: &[(usize, Vec<usize>)]) -> Option<Series> {
self.agg_quantile(groups, 0.5)
}
}
// Numeric arrays: for every group (in parallel) the group's values are gathered, arg-sorted,
// and the element at position `quantile * (n - 1)` (truncated) of the sort order is picked.
impl<T> AggQuantile for ChunkedArray<T>
where
    T: PolarsNumericType + Sync,
    T::Native: PartialEq,
    ChunkedArray<T>: IntoSeries,
{
    fn agg_quantile(&self, groups: &[(usize, Vec<usize>)], quantile: f64) -> Option<Series> {
        let per_group: ChunkedArray<T> = groups
            .into_par_iter()
            .map(|(_first, idx)| {
                // SAFETY: relies on the group indexes being valid positions in `self`.
                let taken = unsafe { self.take_unchecked(idx.iter().copied(), Some(idx.len())) };
                let order = taken.argsort(false);
                let last_pos = order.len() - 1;
                let pick = (quantile * last_pos as f64) as usize;
                let value_pos = order[pick];
                taken.get(value_pos)
            })
            .collect();
        Some(per_group.into_series())
    }
}
// Non-numeric types keep the trait defaults, so quantile and median yield `None` for them.
impl AggQuantile for Utf8Chunked {}
impl AggQuantile for BooleanChunked {}
impl AggQuantile for ListChunked {}
#[cfg(feature = "object")]
impl<T> AggQuantile for ObjectChunked<T> {}
impl<'df, 'selection_str> GroupBy<'df, 'selection_str> {
/// Restricts the following aggregations to the given column selection and returns the
/// updated `GroupBy`.
pub fn select<S, J>(mut self, selection: S) -> Self
where
    S: Selection<'selection_str, J>,
{
    let chosen = selection.to_selection_vec();
    self.selected_agg = Some(chosen);
    self
}
/// Returns the computed groups: one `(first index, all indexes)` tuple per group.
pub fn get_groups(&self) -> &Vec<(usize, Vec<usize>)> {
&self.groups
}
/// Builds the key columns of the aggregated output: every selected key series, taken at the
/// first index of each group.
///
/// The returned vector is allocated with room for the aggregation columns that callers
/// push afterwards.
pub(crate) fn keys(&self) -> Vec<Series> {
    let n_agg = self.selected_agg.as_ref().map_or(0, |sel| sel.len());
    let mut keys = Vec::with_capacity(n_agg + self.selected_keys.len());
    for key_series in &self.selected_keys {
        let mut first_indexes = self.groups.iter().map(|(first, _)| *first);
        // SAFETY: relies on the group indexes being valid row positions of the key series.
        let key = unsafe {
            key_series.take_iter_unchecked(&mut first_indexes, Some(self.groups.len()))
        };
        keys.push(key);
    }
    keys
}
/// Returns `(key columns, columns to aggregate)`.
///
/// The columns to aggregate are the explicit selection when one was made with `select`,
/// otherwise every column of the DataFrame whose name is not one of the key names.
///
/// # Errors
/// Propagates the error from `select_series` when a selected column cannot be resolved.
fn prepare_agg(&self) -> Result<(Vec<Series>, Vec<Series>)> {
    let selection = if let Some(chosen) = &self.selected_agg {
        chosen.clone()
    } else {
        let key_names: Vec<_> = self.selected_keys.iter().map(|s| s.name()).collect();
        self.df
            .get_column_names()
            .into_iter()
            .filter(|name| !key_names.contains(name))
            .collect()
    };
    let keys = self.keys();
    let agg_col = self.df.select_series(selection)?;
    Ok((keys, agg_col))
}
/// Aggregates the selected columns per group with the mean.
///
/// Columns whose type does not support the aggregation are left out. Output columns are
/// named `<name>_mean` and follow the key columns.
pub fn mean(&self) -> Result<DataFrame> {
    let (mut cols, agg_cols) = self.prepare_agg()?;
    cols.extend(agg_cols.into_iter().filter_map(|agg_col| {
        let mut agg = agg_col.agg_mean(&self.groups)?;
        agg.rename(&fmt_groupby_column(agg_col.name(), GroupByMethod::Mean));
        Some(agg)
    }));
    DataFrame::new(cols)
}
/// Aggregates the selected columns per group with the sum.
///
/// Columns whose type does not support the aggregation are left out. Output columns are
/// named `<name>_sum` and follow the key columns.
pub fn sum(&self) -> Result<DataFrame> {
    let (mut cols, agg_cols) = self.prepare_agg()?;
    cols.extend(agg_cols.into_iter().filter_map(|agg_col| {
        let mut agg = agg_col.agg_sum(&self.groups)?;
        agg.rename(&fmt_groupby_column(agg_col.name(), GroupByMethod::Sum));
        Some(agg)
    }));
    DataFrame::new(cols)
}
/// Aggregates the selected columns per group with the minimum.
///
/// Columns whose type does not support the aggregation are left out. Output columns are
/// named `<name>_min` and follow the key columns.
pub fn min(&self) -> Result<DataFrame> {
    let (mut cols, agg_cols) = self.prepare_agg()?;
    cols.extend(agg_cols.into_iter().filter_map(|agg_col| {
        let mut agg = agg_col.agg_min(&self.groups)?;
        agg.rename(&fmt_groupby_column(agg_col.name(), GroupByMethod::Min));
        Some(agg)
    }));
    DataFrame::new(cols)
}
/// Aggregates the selected columns per group with the maximum.
///
/// Columns whose type does not support the aggregation are left out. Output columns are
/// named `<name>_max` and follow the key columns.
pub fn max(&self) -> Result<DataFrame> {
    let (mut cols, agg_cols) = self.prepare_agg()?;
    cols.extend(agg_cols.into_iter().filter_map(|agg_col| {
        let mut agg = agg_col.agg_max(&self.groups)?;
        agg.rename(&fmt_groupby_column(agg_col.name(), GroupByMethod::Max));
        Some(agg)
    }));
    DataFrame::new(cols)
}
/// Aggregates the selected columns per group by taking each group's first value.
///
/// Output columns are named `<name>_first` and follow the key columns.
pub fn first(&self) -> Result<DataFrame> {
    let (mut cols, agg_cols) = self.prepare_agg()?;
    cols.extend(agg_cols.into_iter().map(|agg_col| {
        let mut agg = agg_col.agg_first(&self.groups);
        agg.rename(&fmt_groupby_column(agg_col.name(), GroupByMethod::First));
        agg
    }));
    DataFrame::new(cols)
}
/// Aggregates the selected columns per group by taking each group's last value.
///
/// Output columns are named `<name>_last` and follow the key columns.
pub fn last(&self) -> Result<DataFrame> {
    let (mut cols, agg_cols) = self.prepare_agg()?;
    cols.extend(agg_cols.into_iter().map(|agg_col| {
        let mut agg = agg_col.agg_last(&self.groups);
        agg.rename(&fmt_groupby_column(agg_col.name(), GroupByMethod::Last));
        agg
    }));
    DataFrame::new(cols)
}
/// Aggregates the selected columns per group with the number of distinct values.
///
/// Columns whose type does not support the aggregation are left out. Output columns are
/// named `<name>_n_unique` and follow the key columns.
pub fn n_unique(&self) -> Result<DataFrame> {
    let (mut cols, agg_cols) = self.prepare_agg()?;
    cols.extend(agg_cols.into_iter().filter_map(|agg_col| {
        let mut agg = agg_col.agg_n_unique(&self.groups)?;
        agg.rename(&fmt_groupby_column(agg_col.name(), GroupByMethod::NUnique));
        Some(agg.into_series())
    }));
    DataFrame::new(cols)
}
/// Aggregates the selected columns per group with the given quantile.
///
/// Columns whose type does not support the aggregation are left out. Output columns are
/// named `<name>_quantile_<q>` (two decimals) and follow the key columns.
///
/// # Errors
/// Returns `PolarsError::Other` when `quantile` is not contained in `0.0..=1.0`.
pub fn quantile(&self, quantile: f64) -> Result<DataFrame> {
    // `contains` is also false for NaN, so NaN is rejected here as well.
    let in_range = (0.0..=1.0).contains(&quantile);
    if !in_range {
        return Err(PolarsError::Other(
            "quantile should be within 0.0 and 1.0".into(),
        ));
    }
    let (mut cols, agg_cols) = self.prepare_agg()?;
    cols.extend(agg_cols.into_iter().filter_map(|agg_col| {
        let mut agg = agg_col.agg_quantile(&self.groups, quantile)?;
        agg.rename(&fmt_groupby_column(
            agg_col.name(),
            GroupByMethod::Quantile(quantile),
        ));
        Some(agg.into_series())
    }));
    DataFrame::new(cols)
}
/// Aggregates the selected columns per group with the median.
///
/// Columns whose type does not support the aggregation are left out. Output columns are
/// named `<name>_median` and follow the key columns.
pub fn median(&self) -> Result<DataFrame> {
    let (mut cols, agg_cols) = self.prepare_agg()?;
    cols.extend(agg_cols.into_iter().filter_map(|agg_col| {
        let mut agg = agg_col.agg_median(&self.groups)?;
        agg.rename(&fmt_groupby_column(agg_col.name(), GroupByMethod::Median));
        Some(agg.into_series())
    }));
    DataFrame::new(cols)
}
/// Produces, for every selected column, a `<name>_count` column holding the number of rows
/// in each group. The count columns follow the key columns.
pub fn count(&self) -> Result<DataFrame> {
    let (mut cols, agg_cols) = self.prepare_agg()?;
    for agg_col in agg_cols {
        let count_name = fmt_groupby_column(agg_col.name(), GroupByMethod::Count);
        let mut counts =
            PrimitiveChunkedBuilder::<UInt32Type>::new(&count_name, self.groups.len());
        self.groups
            .iter()
            .for_each(|(_first, idx)| counts.append_value(idx.len() as u32));
        cols.push(counts.finish().into_series())
    }
    DataFrame::new(cols)
}
/// Returns the key columns followed by a list column named `groups` that holds, for every
/// group, the row indexes belonging to it (as `u32`).
pub fn groups(&self) -> Result<DataFrame> {
    let mut cols = self.keys();
    // Converts one group's index list into a UInt32 series.
    let to_index_series = |idx: &Vec<usize>| {
        let indexes: Xob<UInt32Chunked> = idx.iter().map(|&i| i as u32).collect();
        indexes.into_inner().into_series()
    };
    let mut group_indexes: ListChunked = self
        .groups
        .iter()
        .map(|(_first, idx)| to_index_series(idx))
        .collect();
    group_indexes.rename(&fmt_groupby_column("", GroupByMethod::Groups));
    cols.push(group_indexes.into_series());
    cols.shrink_to_fit();
    DataFrame::new(cols)
}
/// Applies several named aggregations per column in one pass.
///
/// `column_to_agg` maps a column name to the list of aggregations to run on it. Supported
/// aggregation names are `"min"`, `"max"`, `"mean"`, `"sum"`, `"first"`, `"last"`,
/// `"n_unique"`, `"median"` and `"count"`. Every result column is named
/// `<column>_<aggregation>` and follows the key columns. Only columns returned by
/// `prepare_agg` are considered; aggregations a column's type does not support are skipped.
///
/// # Errors
/// Propagates the errors of `prepare_agg` and `DataFrame::new`.
///
/// # Panics
/// Panics when an aggregation name is not one of the supported names listed above.
pub fn agg<Column, S, Slice>(&self, column_to_agg: &[(Column, Slice)]) -> Result<DataFrame>
where
    S: AsRef<str>,
    Slice: AsRef<[S]>,
    Column: AsRef<str>,
{
    // Column name -> requested aggregations, for lookup while walking the columns.
    let mut map = HashMap::with_capacity_and_hasher(column_to_agg.len(), RandomState::new());
    column_to_agg.iter().for_each(|(column, aggregations)| {
        map.insert(column.as_ref(), aggregations.as_ref());
    });

    // Runs an aggregation that may be unsupported (returns `Option`) and pushes the renamed
    // result when there is one.
    macro_rules! finish_agg_opt {
        ($self:ident, $name_fmt:expr, $agg_fn:ident, $agg_col:ident, $cols:ident) => {{
            let new_name = format![$name_fmt, $agg_col.name()];
            let opt_agg = $agg_col.$agg_fn(&$self.groups);
            if let Some(mut agg) = opt_agg {
                agg.rename(&new_name);
                $cols.push(agg.into_series());
            }
        }};
    }
    // Runs an aggregation that always yields a result and pushes it under the new name.
    macro_rules! finish_agg {
        ($self:ident, $name_fmt:expr, $agg_fn:ident, $agg_col:ident, $cols:ident) => {{
            let new_name = format![$name_fmt, $agg_col.name()];
            let mut agg = $agg_col.$agg_fn(&$self.groups);
            agg.rename(&new_name);
            $cols.push(agg.into_series());
        }};
    }

    let (mut cols, agg_cols) = self.prepare_agg()?;
    for agg_col in &agg_cols {
        if let Some(&aggregations) = map.get(agg_col.name()) {
            for aggregation_f in aggregations {
                match aggregation_f.as_ref() {
                    "min" => finish_agg_opt!(self, "{}_min", agg_min, agg_col, cols),
                    "max" => finish_agg_opt!(self, "{}_max", agg_max, agg_col, cols),
                    "mean" => finish_agg_opt!(self, "{}_mean", agg_mean, agg_col, cols),
                    "sum" => finish_agg_opt!(self, "{}_sum", agg_sum, agg_col, cols),
                    "first" => finish_agg!(self, "{}_first", agg_first, agg_col, cols),
                    "last" => finish_agg!(self, "{}_last", agg_last, agg_col, cols),
                    "n_unique" => {
                        finish_agg_opt!(self, "{}_n_unique", agg_n_unique, agg_col, cols)
                    }
                    // Must dispatch to `agg_median`; `agg_n_unique` here would put distinct
                    // counts into the `_median` column.
                    "median" => finish_agg_opt!(self, "{}_median", agg_median, agg_col, cols),
                    "count" => {
                        let new_name = format!["{}_count", agg_col.name()];
                        let mut builder = PrimitiveChunkedBuilder::<UInt32Type>::new(
                            &new_name,
                            self.groups.len(),
                        );
                        for (_first, idx) in &self.groups {
                            builder.append_value(idx.len() as u32);
                        }
                        let ca = builder.finish();
                        cols.push(ca.into_series());
                    }
                    a => panic!("aggregation: {:?} is not supported", a),
                }
            }
        }
    }
    DataFrame::new(cols)
}
/// Aggregates the selected columns per group into list columns.
///
/// For every selected column the values of each group are gathered into one list entry.
/// Output columns are named `<name>_agg_list` and follow the key columns.
pub fn agg_list(&self) -> Result<DataFrame> {
// Body used for primitive dtypes: one list entry per group in a primitive list builder.
macro_rules! impl_gb {
($type:ty, $agg_col:expr) => {{
let values_builder = PrimitiveBuilder::<$type>::new(self.groups.len());
let mut builder =
ListPrimitiveChunkedBuilder::new("", values_builder, self.groups.len());
for (_first, idx) in &self.groups {
// Gather the values at this group's indexes.
let s = unsafe {
$agg_col.take_iter_unchecked(&mut idx.into_iter().copied(), Some(idx.len()))
};
builder.append_opt_series(Some(&s))
}
builder.finish().into_series()
}};
}
// Body used for string columns: same as above with a Utf8 list builder.
macro_rules! impl_gb_utf8 {
($agg_col:expr) => {{
let values_builder = StringBuilder::new(self.groups.len());
let mut builder =
ListUtf8ChunkedBuilder::new("", values_builder, self.groups.len());
for (_first, idx) in &self.groups {
// Gather the values at this group's indexes.
let s = unsafe {
$agg_col.take_iter_unchecked(&mut idx.into_iter().copied(), Some(idx.len()))
};
builder.append_series(&s)
}
builder.finish().into_series()
}};
}
let (mut cols, agg_cols) = self.prepare_agg()?;
for agg_col in agg_cols {
let new_name = fmt_groupby_column(agg_col.name(), GroupByMethod::List);
// Pick the builder macro that matches the column's dtype.
let mut agg =
match_arrow_data_type_apply_macro!(agg_col.dtype(), impl_gb, impl_gb_utf8, agg_col);
agg.rename(&new_name);
cols.push(agg);
}
DataFrame::new(cols)
}
/// Starts a pivot on this `GroupBy`.
///
/// Sets the aggregation selection to `[pivot_column, values_column]` and returns a `Pivot`
/// that borrows this `GroupBy` together with both column names.
pub fn pivot(
&mut self,
pivot_column: &'selection_str str,
values_column: &'selection_str str,
) -> Pivot {
// Overwrites any selection made earlier with `select`.
self.selected_agg = Some(vec![pivot_column, values_column]);
Pivot {
gb: self,
pivot_column,
values_column,
}
}
}
/// The aggregation kinds known to the groupby code; used by `fmt_groupby_column` to derive
/// output column names.
#[derive(Copy, Clone)]
pub(crate) enum GroupByMethod {
Min,
Max,
Median,
Mean,
First,
Last,
Sum,
Groups,
NUnique,
// Carries the requested quantile, which becomes part of the column name.
Quantile(f64),
Count,
List,
}
pub(crate) fn fmt_groupby_column(name: &str, method: GroupByMethod) -> String {
use GroupByMethod::*;
match method {
Min => format!["{}_min", name],
Max => format!["{}_max", name],
Median => format!["{}_median", name],
Mean => format!["{}_mean", name],
First => format!["{}_first", name],
Last => format!["{}_last", name],
Sum => format!["{}_sum", name],
Groups => "groups".to_string(),
NUnique => format!["{}_n_unique", name],
Count => format!["{}_count", name],
List => format!["{}_agg_list", name],
Quantile(quantile) => format!["{}_quantile_{:.2}", name, quantile],
}
}
/// A pending pivot operation created by `GroupBy::pivot`.
pub struct Pivot<'df, 'selection_str> {
// The grouping this pivot operates on.
gb: &'df GroupBy<'df, 'selection_str>,
// Name of the column whose values become the new column names.
pivot_column: &'selection_str str,
// Name of the column whose values are aggregated.
values_column: &'selection_str str,
}
/// Pivot operations on the values column.
///
/// Both default implementations return `PolarsError::InvalidOperation`, so a type only
/// supports the operations it overrides.
pub(crate) trait ChunkPivot {
/// Pivots `self` (the values) on `_pivot_series`, aggregating every group/column cell with
/// `_agg_type`. `_keys` are the key columns that start the resulting DataFrame.
fn pivot<'a>(
&self,
_pivot_series: &'a (dyn SeriesTrait + 'a),
_keys: Vec<Series>,
_groups: &[(usize, Vec<usize>)],
_agg_type: PivotAgg,
) -> Result<DataFrame> {
Err(PolarsError::InvalidOperation(
"Pivot operation not implemented for this type".into(),
))
}
/// Pivots `self` (the values) on `_pivot_series`, counting the entries of every
/// group/column cell. `_keys` are the key columns that start the resulting DataFrame.
fn pivot_count<'a>(
&self,
_pivot_series: &'a (dyn SeriesTrait + 'a),
_keys: Vec<Series>,
_groups: &[(usize, Vec<usize>)],
) -> Result<DataFrame> {
Err(PolarsError::InvalidOperation(
"Pivot count operation not implemented for this type".into(),
))
}
}
/// Creates a map with one empty value vector for every distinct non-null pivot value.
///
/// `size` is only used as the initial capacity of the map.
fn create_column_values_map<'a, T>(
    pivot_vec: &'a [Option<Groupable>],
    size: usize,
) -> HashMap<&'a Groupable<'a>, Vec<Option<T>>, RandomState> {
    let mut values_by_column = HashMap::with_capacity_and_hasher(size, RandomState::new());
    // `flatten` skips the `None` entries and yields references to the present values.
    for column_name in pivot_vec.iter().flatten() {
        values_by_column
            .entry(column_name)
            .or_insert_with(Vec::new);
    }
    values_by_column
}
/// Creates a map with one array builder for every distinct non-null pivot value.
///
/// Each builder is named after the `Debug` output of its pivot value and is created with a
/// capacity of one slot per group.
fn create_new_column_builder_map<'a, T>(
    pivot_vec: &'a [Option<Groupable>],
    groups: &[(usize, Vec<usize>)],
) -> HashMap<&'a Groupable<'a>, PrimitiveChunkedBuilder<T>, RandomState>
where
    T: PolarsNumericType,
{
    let mut builder_by_column =
        HashMap::with_capacity_and_hasher(pivot_vec.len(), RandomState::new());
    // `flatten` skips the `None` entries and yields references to the present values.
    for column_name in pivot_vec.iter().flatten() {
        builder_by_column.entry(column_name).or_insert_with(|| {
            let builder_name = format!("{:?}", column_name);
            PrimitiveChunkedBuilder::<T>::new(&builder_name, groups.len())
        });
    }
    builder_by_column
}
// Numeric value columns support both the aggregating pivot and the counting pivot.
impl<T> ChunkPivot for ChunkedArray<T>
where
    T: PolarsNumericType,
    T::Native: Copy + Num + NumCast,
    ChunkedArray<T>: IntoSeries,
{
    /// Pivots the values of `self` on `pivot_series`.
    ///
    /// One output column is created per distinct non-null pivot value. For every group, the
    /// values that fall under each pivot value are collected and reduced with `agg_type`; a
    /// pivot value without entries in the group yields a null.
    fn pivot<'a>(
        &self,
        pivot_series: &'a (dyn SeriesTrait + 'a),
        keys: Vec<Series>,
        groups: &[(usize, Vec<usize>)],
        agg_type: PivotAgg,
    ) -> Result<DataFrame> {
        let pivot_vec: Vec<_> = pivot_series.as_groupable_iter()?.collect();
        let values_taker = self.take_rand();
        // One builder per output column, filled with one value per group.
        let mut builders = create_new_column_builder_map::<T>(&pivot_vec, groups);

        for (_first, idx) in groups {
            // Values of this group, bucketed by pivot value.
            let mut buckets = create_column_values_map::<T::Native>(&pivot_vec, idx.len());
            for &row in idx {
                // SAFETY: relies on the group indexes being valid positions in `pivot_vec`.
                let opt_pivot_val = unsafe { pivot_vec.get_unchecked(row) };
                if let Some(pivot_val) = opt_pivot_val {
                    let value = values_taker.get(row);
                    if let Some(bucket) = buckets.get_mut(&pivot_val) {
                        bucket.push(value)
                    }
                }
            }

            for (column, bucket) in &mut buckets {
                let builder = builders.get_mut(column).unwrap();
                if bucket.is_empty() {
                    builder.append_null()
                } else {
                    match agg_type {
                        PivotAgg::First => pivot_agg_first(builder, bucket),
                        PivotAgg::Sum => pivot_agg_sum(builder, bucket),
                        PivotAgg::Min => pivot_agg_min(builder, bucket),
                        PivotAgg::Max => pivot_agg_max(builder, bucket),
                        PivotAgg::Mean => pivot_agg_mean(builder, bucket),
                        PivotAgg::Median => pivot_agg_median(builder, bucket),
                    }
                }
            }
        }

        // Key columns first, then one column per pivot value.
        let mut cols = keys;
        cols.reserve_exact(builders.len());
        for (_column, builder) in builders {
            cols.push(builder.finish().into_series());
        }
        DataFrame::new(cols)
    }

    /// Counting pivot; see `pivot_count_impl`.
    fn pivot_count<'a>(
        &self,
        pivot_series: &'a (dyn SeriesTrait + 'a),
        keys: Vec<Series>,
        groups: &[(usize, Vec<usize>)],
    ) -> Result<DataFrame> {
        pivot_count_impl(self, pivot_series, keys, groups)
    }
}
/// Counting pivot shared by all value column types.
///
/// One `UInt32` output column is created per distinct non-null pivot value. For every group
/// it holds the number of rows of that group that carry the pivot value.
fn pivot_count_impl<'a, CA: TakeRandom>(
    ca: &CA,
    pivot_series: &'a (dyn SeriesTrait + 'a),
    keys: Vec<Series>,
    groups: &[(usize, Vec<usize>)],
) -> Result<DataFrame> {
    let pivot_vec: Vec<_> = pivot_series.as_groupable_iter()?.collect();
    // One builder per output column, filled with one count per group.
    let mut builders = create_new_column_builder_map::<UInt32Type>(&pivot_vec, groups);

    for (_first, idx) in groups {
        // Values of this group, bucketed by pivot value; only the bucket sizes are used.
        let mut buckets = create_column_values_map::<CA::Item>(&pivot_vec, idx.len());
        for &row in idx {
            // SAFETY: relies on the group indexes being valid positions in `pivot_vec`.
            let opt_pivot_val = unsafe { pivot_vec.get_unchecked(row) };
            if let Some(pivot_val) = opt_pivot_val {
                let value = ca.get(row);
                if let Some(bucket) = buckets.get_mut(&pivot_val) {
                    bucket.push(value)
                }
            }
        }

        for (column, bucket) in &mut buckets {
            let builder = builders.get_mut(column).unwrap();
            builder.append_value(bucket.len() as u32)
        }
    }

    // Key columns first, then one column per pivot value.
    let mut cols = keys;
    cols.reserve_exact(builders.len());
    for (_column, builder) in builders {
        cols.push(builder.finish().into_series());
    }
    DataFrame::new(cols)
}
// Boolean value columns only support the counting pivot; `pivot` keeps the erroring default.
impl ChunkPivot for BooleanChunked {
fn pivot_count<'a>(
&self,
pivot_series: &'a (dyn SeriesTrait + 'a),
keys: Vec<Series>,
groups: &[(usize, Vec<usize>)],
) -> Result<DataFrame> {
pivot_count_impl(self, pivot_series, keys, groups)
}
}
// String value columns only support the counting pivot; `pivot` keeps the erroring default.
impl ChunkPivot for Utf8Chunked {
fn pivot_count<'a>(
&self,
pivot_series: &'a (dyn SeriesTrait + 'a),
keys: Vec<Series>,
groups: &[(usize, Vec<usize>)],
) -> Result<DataFrame> {
// NOTE(review): `&self` is passed here (a reference to the reference), unlike the other
// impls which pass `self` -- presumably `TakeRandom` is implemented for `&Utf8Chunked`;
// confirm.
pivot_count_impl(&self, pivot_series, keys, groups)
}
}
// List and object value columns keep the trait defaults: both pivot operations return
// `PolarsError::InvalidOperation`.
impl ChunkPivot for ListChunked {}
#[cfg(feature = "object")]
impl<T> ChunkPivot for ObjectChunked<T> {}
/// The aggregation applied to every group/column cell of a pivot.
pub enum PivotAgg {
First,
Sum,
Min,
Max,
Mean,
Median,
}
/// Appends the first collected value of a pivot cell to `builder`.
///
/// Panics when `v` is empty; the visible caller only invokes it for non-empty cells.
fn pivot_agg_first<T>(builder: &mut PrimitiveChunkedBuilder<T>, v: &[Option<T::Native>])
where
    T: PolarsNumericType,
{
    let first = v[0];
    builder.append_option(first);
}
/// Sorts the collected values of a pivot cell in place and appends the element at the middle
/// position (`len / 2`) to `builder`.
///
/// Panics when two elements cannot be ordered by `partial_cmp`, and when `v` is empty.
fn pivot_agg_median<T>(builder: &mut PrimitiveChunkedBuilder<T>, v: &mut Vec<Option<T::Native>>)
where
    T: PolarsNumericType,
    T::Native: PartialOrd,
{
    v.sort_unstable_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap());
    let middle = v.len() / 2;
    builder.append_option(v[middle]);
}
/// Appends the sum of the collected values of a pivot cell to `builder`.
///
/// The sum is folded with itertools' `fold_options`, starting from zero.
fn pivot_agg_sum<T>(builder: &mut PrimitiveChunkedBuilder<T>, v: &[Option<T::Native>])
where
    T: PolarsNumericType,
    T::Native: Num + Zero,
{
    let total: Option<T::Native> = v.iter().copied().fold_options(Zero::zero(), Add::add);
    builder.append_option(total);
}
/// Appends the mean of the collected values of a pivot cell to `builder`.
///
/// The sum is folded with itertools' `fold_options` and divided by the number of collected
/// entries, all in the native type of the column.
fn pivot_agg_mean<T>(builder: &mut PrimitiveChunkedBuilder<T>, v: &[Option<T::Native>])
where
    T: PolarsNumericType,
    T::Native: Num + Zero + NumCast,
{
    let total = v
        .iter()
        .copied()
        .fold_options::<T::Native, T::Native, _>(Zero::zero(), Add::add);
    let mean = total.map(|sum_val| sum_val / NumCast::from(v.len()).unwrap());
    builder.append_option(mean);
}
/// Appends the smallest non-null value of a pivot cell to `builder`, or null when every
/// collected value is null.
fn pivot_agg_min<T>(builder: &mut PrimitiveChunkedBuilder<T>, v: &[Option<T::Native>])
where
    T: PolarsNumericType,
{
    let mut min: Option<T::Native> = None;
    // `flatten` skips the null entries.
    for val in v.iter().flatten() {
        let is_new_min = match min {
            None => true,
            Some(current) => val < &current,
        };
        if is_new_min {
            min = Some(*val);
        }
    }
    builder.append_option(min);
}
/// Appends the largest `Some` value of `v` to `builder`.
///
/// `None` elements are skipped. If `v` holds no `Some` value at all (including when it is
/// empty) `None` is appended. A later value only replaces the current maximum when it
/// compares strictly greater than it.
fn pivot_agg_max<T>(builder: &mut PrimitiveChunkedBuilder<T>, v: &[Option<T::Native>])
where
    T: PolarsNumericType,
{
    let max = v
        .iter()
        .flatten()
        .fold(None, |largest: Option<T::Native>, &candidate| match largest {
            Some(current) if candidate > current => Some(candidate),
            None => Some(candidate),
            unchanged => unchanged,
        });
    builder.append_option(max);
}
impl<'df, 'sel_str> Pivot<'df, 'sel_str> {
pub fn count(&self) -> Result<DataFrame> {
let pivot_series = self.gb.df.column(self.pivot_column)?;
let values_series = self.gb.df.column(self.values_column)?;
values_series.pivot_count(&**pivot_series, self.gb.keys(), &self.gb.groups)
}
pub fn first(&self) -> Result<DataFrame> {
let pivot_series = self.gb.df.column(self.pivot_column)?;
let values_series = self.gb.df.column(self.values_column)?;
values_series.pivot(
&**pivot_series,
self.gb.keys(),
&self.gb.groups,
PivotAgg::First,
)
}
pub fn sum(&self) -> Result<DataFrame> {
let pivot_series = self.gb.df.column(self.pivot_column)?;
let values_series = self.gb.df.column(self.values_column)?;
values_series.pivot(
&**pivot_series,
self.gb.keys(),
&self.gb.groups,
PivotAgg::Sum,
)
}
pub fn min(&self) -> Result<DataFrame> {
let pivot_series = self.gb.df.column(self.pivot_column)?;
let values_series = self.gb.df.column(self.values_column)?;
values_series.pivot(
&**pivot_series,
self.gb.keys(),
&self.gb.groups,
PivotAgg::Min,
)
}
pub fn max(&self) -> Result<DataFrame> {
let pivot_series = self.gb.df.column(self.pivot_column)?;
let values_series = self.gb.df.column(self.values_column)?;
values_series.pivot(
&**pivot_series,
self.gb.keys(),
&self.gb.groups,
PivotAgg::Max,
)
}
pub fn mean(&self) -> Result<DataFrame> {
let pivot_series = self.gb.df.column(self.pivot_column)?;
let values_series = self.gb.df.column(self.values_column)?;
values_series.pivot(
&**pivot_series,
self.gb.keys(),
&self.gb.groups,
PivotAgg::Mean,
)
}
pub fn median(&self) -> Result<DataFrame> {
let pivot_series = self.gb.df.column(self.pivot_column)?;
let values_series = self.gb.df.column(self.values_column)?;
values_series.pivot(
&**pivot_series,
self.gb.keys(),
&self.gb.groups,
PivotAgg::Median,
)
}
}
// Unit tests for the group-by and pivot API. Assertions sort their result columns first, so
// they do not depend on the order in which groups come out of the hash-based grouping.
#[cfg(test)]
mod test {
use crate::prelude::*;
// Smoke test: runs each group-by aggregation on a small weather frame and prints the result.
// Every call is unwrapped, so any aggregation returning an error fails the test; the only
// value assertion is the width of the frame-wide `n_unique` result.
#[test]
fn test_group_by() {
// Five rows over three distinct dates; 2020-08-21 and 2020-08-22 each occur twice.
let s0 = Date32Chunked::parse_from_str_slice(
"date",
&[
"2020-08-21",
"2020-08-21",
"2020-08-22",
"2020-08-23",
"2020-08-22",
],
"%Y-%m-%d",
)
.into_series();
let s1 = Series::new("temp", [20, 10, 7, 9, 1].as_ref());
let s2 = Series::new("rain", [0.2, 0.1, 0.3, 0.1, 0.01].as_ref());
let df = DataFrame::new(vec![s0, s1, s2]).unwrap();
println!("{:?}", df);
println!(
"{:?}",
df.groupby("date").unwrap().select("temp").count().unwrap()
);
// Selecting several columns at once.
println!(
"{:?}",
df.groupby("date")
.unwrap()
.select(&["temp", "rain"])
.mean()
.unwrap()
);
// Grouping on more than one key column.
println!(
"multiple keys {:?}",
df.groupby(&["date", "temp"])
.unwrap()
.select("rain")
.mean()
.unwrap()
);
println!(
"{:?}",
df.groupby("date").unwrap().select("temp").sum().unwrap()
);
println!(
"{:?}",
df.groupby("date").unwrap().select("temp").min().unwrap()
);
println!(
"{:?}",
df.groupby("date").unwrap().select("temp").max().unwrap()
);
println!(
"{:?}",
df.groupby("date")
.unwrap()
.select("temp")
.agg_list()
.unwrap()
);
println!(
"{:?}",
df.groupby("date").unwrap().select("temp").first().unwrap()
);
println!(
"{:?}",
df.groupby("date").unwrap().select("temp").last().unwrap()
);
println!(
"{:?}",
df.groupby("date")
.unwrap()
.select("temp")
.n_unique()
.unwrap()
);
println!(
"{:?}",
df.groupby("date")
.unwrap()
.select("temp")
.quantile(0.2)
.unwrap()
);
println!(
"{:?}",
df.groupby("date").unwrap().select("temp").median().unwrap()
);
// Without an explicit `select` the aggregation is applied frame-wide; the result is
// expected to be two columns wide.
let gb = df.groupby("date").unwrap().n_unique().unwrap();
println!("{:?}", df.groupby("date").unwrap().n_unique().unwrap());
assert_eq!(gb.width(), 2);
// Several named aggregations on one column through `agg`.
println!(
"{:?}",
df.groupby("date")
.unwrap()
.agg(&[("temp", &["n_unique", "sum", "min"])])
.unwrap()
);
println!("{:?}", df.groupby("date").unwrap().groups().unwrap());
}
// Pivots "N" over the values of "bar" within groups of "foo" and checks the "m" column.
// "m" occurs only in group "B" (N = 2 and 4); groups "A" and "C" have no "m" rows.
#[test]
fn test_pivot() {
let s0 = Series::new("foo", ["A", "A", "B", "B", "C"].as_ref());
let s1 = Series::new("N", [1, 2, 2, 4, 2].as_ref());
let s2 = Series::new("bar", ["k", "l", "m", "m", "l"].as_ref());
let df = DataFrame::new(vec![s0, s1, s2]).unwrap();
println!("{:?}", df);
// sum: 2 + 4 = 6 for "B", null for the two groups without "m".
let pvt = df.groupby("foo").unwrap().pivot("bar", "N").sum().unwrap();
assert_eq!(
Vec::from(&pvt.column("m").unwrap().i32().unwrap().sort(false)),
&[None, None, Some(6)]
);
// min of {2, 4}.
let pvt = df.groupby("foo").unwrap().pivot("bar", "N").min().unwrap();
assert_eq!(
Vec::from(&pvt.column("m").unwrap().i32().unwrap().sort(false)),
&[None, None, Some(2)]
);
// max of {2, 4}.
let pvt = df.groupby("foo").unwrap().pivot("bar", "N").max().unwrap();
assert_eq!(
Vec::from(&pvt.column("m").unwrap().i32().unwrap().sort(false)),
&[None, None, Some(4)]
);
// mean: (2 + 4) / 2 = 3; the result column is still read back as i32.
let pvt = df.groupby("foo").unwrap().pivot("bar", "N").mean().unwrap();
assert_eq!(
Vec::from(&pvt.column("m").unwrap().i32().unwrap().sort(false)),
&[None, None, Some(3)]
);
// count: a u32 column, with 0 rather than null for the groups without "m".
let pvt = df
.groupby("foo")
.unwrap()
.pivot("bar", "N")
.count()
.unwrap();
assert_eq!(
Vec::from(&pvt.column("m").unwrap().u32().unwrap().sort(false)),
&[Some(0), Some(0), Some(2)]
);
}
// Groups on 12 key columns of mixed types (strings, booleans, integers).
// NOTE(review): the "static" in the name presumably refers to a dedicated code path for up
// to 12 keys - confirm against the groupby implementation.
#[test]
fn test_static_groupby_by_12_columns() {
// Rows 2 and 3 agree on every key column (G1..G12); all other rows are unique.
let s0 = Series::new("G1", ["A", "A", "B", "B", "C"].as_ref());
let s1 = Series::new("N", [1, 2, 2, 4, 2].as_ref());
let s2 = Series::new("G2", ["k", "l", "m", "m", "l"].as_ref());
let s3 = Series::new("G3", ["a", "b", "c", "c", "d"].as_ref());
let s4 = Series::new("G4", ["1", "2", "3", "3", "4"].as_ref());
let s5 = Series::new("G5", ["X", "Y", "Z", "Z", "W"].as_ref());
let s6 = Series::new("G6", [false, true, true, true, false].as_ref());
let s7 = Series::new("G7", ["r", "x", "q", "q", "o"].as_ref());
let s8 = Series::new("G8", ["R", "X", "Q", "Q", "O"].as_ref());
let s9 = Series::new("G9", [1, 2, 3, 3, 4].as_ref());
let s10 = Series::new("G10", [".", "!", "?", "?", "/"].as_ref());
let s11 = Series::new("G11", ["(", ")", "@", "@", "$"].as_ref());
let s12 = Series::new("G12", ["-", "_", ";", ";", ","].as_ref());
let df =
DataFrame::new(vec![s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12]).unwrap();
println!("{:?}", df);
let adf = df
.groupby(&[
"G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12",
])
.unwrap()
.select("N")
.sum()
.unwrap();
println!("{:?}", adf);
// Four groups: N sums are 1, 2, 2 + 4 = 6 and 2.
assert_eq!(
Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
&[Some(1), Some(2), Some(2), Some(6)]
);
}
// Groups on 13 key columns, one more than the 12-column test above.
// NOTE(review): the "dynamic" in the name presumably refers to a fallback code path for more
// than 12 keys - confirm against the groupby implementation.
#[test]
fn test_dynamic_groupby_by_13_columns() {
// Every key column holds the same content, so the distinct key tuples are A, B and C.
let series_content = ["A", "A", "B", "B", "C"];
let series_names = [
"G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12", "G13",
];
// 13 key columns plus the "N" value column.
let mut series = Vec::with_capacity(14);
for series_name in &series_names {
let serie = Series::new(series_name, series_content.as_ref());
series.push(serie);
}
let serie = Series::new("N", [1, 2, 3, 3, 4].as_ref());
series.push(serie);
let df = DataFrame::new(series).unwrap();
println!("{:?}", df);
let adf = df
.groupby(&series_names)
.unwrap()
.select("N")
.sum()
.unwrap();
println!("{:?}", adf);
// Each key column of the result must contain exactly the three distinct keys.
for series_name in &series_names {
assert_eq!(
Vec::from(&adf.column(series_name).unwrap().utf8().unwrap().sort(false)),
&[Some("A"), Some("B"), Some("C")]
);
}
// N sums per group: A = 1 + 2 = 3, B = 3 + 3 = 6, C = 4.
assert_eq!(
Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
&[Some(3), Some(4), Some(6)]
);
}
// Grouping on a float key column: 1.0 and 2.0 each occur twice, 3.0 once.
#[test]
fn test_groupby_floats() {
let df = df! {"flt" => [1., 1., 2., 2., 3.],
"val" => [1, 1, 1, 1, 1]
}
.unwrap();
let res = df.groupby("flt").unwrap().sum().unwrap();
// Sort on the key so the sums line up with flt = 1.0, 2.0, 3.0.
let res = res.sort("flt", false).unwrap();
assert_eq!(
Vec::from(res.column("val_sum").unwrap().i32().unwrap()),
&[Some(2), Some(2), Some(1)]
);
}
}