polars_expr/groups/
mod.rs

1use std::any::Any;
2
3use arrow::bitmap::BitmapBuilder;
4use polars_core::prelude::*;
5#[cfg(feature = "dtype-categorical")]
6use polars_core::with_match_categorical_physical_type;
7use polars_core::with_match_physical_numeric_polars_type;
8use polars_utils::IdxSize;
9use polars_utils::hashing::HashPartitioner;
10
11use crate::hash_keys::HashKeys;
12
13mod binview;
14mod row_encoded;
15mod single_key;
16
17/// A Grouper maps keys to groups, such that duplicate keys map to the same group.
18pub trait Grouper: Any + Send + Sync {
19    /// Creates a new empty Grouper similar to this one.
20    fn new_empty(&self) -> Box<dyn Grouper>;
21
22    /// Reserves space for the given number additional groups.
23    fn reserve(&mut self, additional: usize);
24
25    /// Returns the number of groups in this Grouper.
26    fn num_groups(&self) -> IdxSize;
27
28    /// Inserts the given subset of keys into this Grouper. If groups_idxs is
29    /// passed it is extended such with the group index of keys[subset[i]].
30    ///
31    /// # Safety
32    /// The subset indexes must be in-bounds.
33    unsafe fn insert_keys_subset(
34        &mut self,
35        keys: &HashKeys,
36        subset: &[IdxSize],
37        group_idxs: Option<&mut Vec<IdxSize>>,
38    );
39
40    /// Returns the keys in this Grouper in group order, that is the key for
41    /// group i is returned in row i.
42    fn get_keys_in_group_order(&self, schema: &Schema) -> DataFrame;
43
44    /// Returns the (indices of the) keys found in the groupers. If
45    /// invert is true it instead returns the keys not found in the groupers.
46    /// # Safety
47    /// All groupers must have the same schema.
48    unsafe fn probe_partitioned_groupers(
49        &self,
50        groupers: &[Box<dyn Grouper>],
51        keys: &HashKeys,
52        partitioner: &HashPartitioner,
53        invert: bool,
54        probe_matches: &mut Vec<IdxSize>,
55    );
56
57    /// Returns for each key if it is found in the groupers. If invert is true
58    /// it returns true if it isn't found.
59    /// # Safety
60    /// All groupers must have the same schema.
61    unsafe fn contains_key_partitioned_groupers(
62        &self,
63        groupers: &[Box<dyn Grouper>],
64        keys: &HashKeys,
65        partitioner: &HashPartitioner,
66        invert: bool,
67        contains_key: &mut BitmapBuilder,
68    );
69
70    fn as_any(&self) -> &dyn Any;
71}
72
73pub fn new_hash_grouper(key_schema: Arc<Schema>) -> Box<dyn Grouper> {
74    if key_schema.len() > 1 {
75        Box::new(row_encoded::RowEncodedHashGrouper::new())
76    } else {
77        let (_name, dt) = key_schema.get_at_index(0).unwrap();
78        match dt {
79            dt if dt.is_primitive_numeric() | dt.is_temporal() => {
80                with_match_physical_numeric_polars_type!(dt.to_physical(), |$T| {
81                    Box::new(single_key::SingleKeyHashGrouper::<$T>::new())
82                })
83            },
84
85            #[cfg(feature = "dtype-decimal")]
86            DataType::Decimal(_, _) => {
87                Box::new(single_key::SingleKeyHashGrouper::<Int128Type>::new())
88            },
89            #[cfg(feature = "dtype-categorical")]
90            dt @ (DataType::Enum(_, _) | DataType::Categorical(_, _)) => {
91                with_match_categorical_physical_type!(dt.cat_physical().unwrap(), |$C| {
92                    Box::new(single_key::SingleKeyHashGrouper::<<$C as PolarsCategoricalType>::PolarsPhysical>::new())
93                })
94            },
95
96            DataType::String | DataType::Binary => Box::new(binview::BinviewHashGrouper::new()),
97
98            _ => Box::new(row_encoded::RowEncodedHashGrouper::new()),
99        }
100    }
101}