h3ron_polars/algorithm/frame/
compact.rs

1use crate::algorithm::chunkedarray::H3CompactCells;
2use crate::frame::H3DataFrame;
3use crate::{AsH3CellChunked, Error};
4use h3ron::collections::H3CellSet;
5use h3ron::iter::change_resolution;
6use h3ron::{H3Cell, Index};
7use polars::export::rayon::iter::ParallelIterator;
8use polars::prelude::{
9    col, ChunkUnique, DataFrame, DataType, IntoLazy, IntoSeries, NamedFrom, Series,
10};
11use polars_core::POOL;
12use std::borrow::Borrow;
13use std::cmp::Ordering;
14
15pub trait H3CompactDataframe {
16    /// Compact the cells in the column named `cell_column_name`.
17    ///
18    /// This is done by first grouping the dataframe using all other columns and then
19    /// compacting the list of cells of each group.
20    fn h3_compact_dataframe<S>(
21        self,
22        cell_column_name: S,
23        return_exploded: bool,
24    ) -> Result<Self, Error>
25    where
26        Self: Sized,
27        S: AsRef<str>;
28}
29
30impl H3CompactDataframe for DataFrame {
31    fn h3_compact_dataframe<S>(
32        self,
33        cell_column_name: S,
34        return_exploded: bool,
35    ) -> Result<Self, Error>
36    where
37        S: AsRef<str>,
38    {
39        compact_df(self, cell_column_name.as_ref(), return_exploded)
40    }
41}
42
43fn compact_df(
44    df: DataFrame,
45    cell_column_name: &str,
46    return_exploded: bool,
47) -> Result<DataFrame, Error> {
48    let group_by_columns = df
49        .fields()
50        .iter()
51        .filter_map(|field| {
52            if field.name() != cell_column_name {
53                Some(col(field.name()))
54            } else {
55                None
56            }
57        })
58        .collect::<Vec<_>>();
59
60    if group_by_columns.is_empty() {
61        let cellchunked = df.column(cell_column_name)?.u64()?.h3cell();
62        let compacted_series = Series::new(cell_column_name, cellchunked.h3_compact_cells()?);
63
64        if return_exploded {
65            Ok(DataFrame::new(vec![compacted_series])?)
66        } else {
67            Ok(DataFrame::new(vec![Series::new(
68                cell_column_name,
69                vec![compacted_series],
70            )])?)
71        }
72    } else {
73        let grouped = df
74            .lazy()
75            .groupby(&group_by_columns)
76            .agg(&[col(cell_column_name).unique()])
77            .collect()?;
78
79        let listchunked_cells = grouped.column(cell_column_name)?.list()?;
80        let compacted_series_vec = POOL.install(|| {
81            // Ordering is preserved. see https://github.com/rayon-rs/rayon/issues/551
82            listchunked_cells
83                .par_iter()
84                .map(compact_maybe_series)
85                .collect::<Result<Vec<_>, _>>()
86        })?;
87
88        let mut grouped = grouped.drop(cell_column_name)?;
89        grouped.with_column(Series::new(cell_column_name, compacted_series_vec))?;
90
91        if return_exploded {
92            Ok(grouped.explode([cell_column_name])?)
93        } else {
94            Ok(grouped)
95        }
96    }
97}
98
99fn compact_maybe_series(maybe_series: Option<Series>) -> Result<Series, Error> {
100    let compacted_series = if let Some(series) = maybe_series {
101        series.u64()?.h3cell().h3_compact_cells()?.into_series()
102    } else {
103        Series::new_empty("", &DataType::UInt64)
104    };
105    Ok(compacted_series)
106}
107
108pub trait H3UncompactDataframe {
109    /// Uncompact the cells in the column named `cell_column_name`.
110    ///
111    /// Implements the reverse of [H3CompactDataframe].
112    fn h3_uncompact_dataframe<S>(
113        self,
114        cell_column_name: S,
115        target_resolution: u8,
116    ) -> Result<Self, Error>
117    where
118        Self: Sized,
119        S: AsRef<str>;
120
121    /// Uncompact the cells in the column named `cell_column_name` while only returning the cells from
122    /// the given `subset`.
123    ///
124    /// Implements the reverse of [H3CompactDataframe].
125    fn h3_uncompact_dataframe_subset<S>(
126        self,
127        cell_column_name: S,
128        target_resolution: u8,
129        subset: &H3CellSet,
130    ) -> Result<Self, Error>
131    where
132        Self: Sized,
133        S: AsRef<str>;
134
135    /// Uncompact the cells in the column named `cell_column_name` while only returning the cells from
136    /// the given `subset`.
137    ///
138    /// Implements the reverse of [H3CompactDataframe].
139    fn h3_uncompact_dataframe_subset_iter<S, I>(
140        self,
141        cell_column_name: S,
142        target_resolution: u8,
143        subset: I,
144    ) -> Result<Self, Error>
145    where
146        Self: Sized,
147        S: AsRef<str>,
148        I: IntoIterator,
149        I::Item: Borrow<H3Cell>,
150    {
151        let subset =
152            change_resolution(subset, target_resolution).collect::<Result<H3CellSet, _>>()?;
153        self.h3_uncompact_dataframe_subset(cell_column_name, target_resolution, &subset)
154    }
155}
156
157impl H3UncompactDataframe for DataFrame {
158    fn h3_uncompact_dataframe<S>(
159        self,
160        cell_column_name: S,
161        target_resolution: u8,
162    ) -> Result<Self, Error>
163    where
164        Self: Sized,
165        S: AsRef<str>,
166    {
167        uncompact_df(self, cell_column_name.as_ref(), target_resolution, |_| true)
168    }
169
170    fn h3_uncompact_dataframe_subset<S>(
171        self,
172        cell_column_name: S,
173        target_resolution: u8,
174        subset: &H3CellSet,
175    ) -> Result<Self, Error>
176    where
177        Self: Sized,
178        S: AsRef<str>,
179    {
180        uncompact_df(self, cell_column_name.as_ref(), target_resolution, |cell| {
181            subset.contains(cell)
182        })
183    }
184}
185
186impl H3DataFrame<H3Cell> {
187    /// Compact the cells.
188    ///
189    /// This is done by first grouping the dataframe using all other columns and then
190    /// compacting the list of cells of each group.
191    pub fn h3_compact_dataframe(&self, return_exploded: bool) -> Result<Self, Error> {
192        self.dataframe()
193            .clone()
194            .h3_compact_dataframe(self.h3index_column_name(), return_exploded)
195            .map(|df| H3DataFrame::from_dataframe_nonvalidated(df, self.h3index_column_name()))
196    }
197
198    /// Uncompact the cells.
199    pub fn h3_uncompact_dataframe(&self, target_resolution: u8) -> Result<Self, Error> {
200        self.dataframe()
201            .clone()
202            .h3_uncompact_dataframe(self.h3index_column_name(), target_resolution)
203            .map(|df| H3DataFrame::from_dataframe_nonvalidated(df, self.h3index_column_name()))
204    }
205
206    /// Uncompact the cells while only returning the cells from
207    /// the given `subset`.
208    pub fn h3_uncompact_dataframe_subset(
209        &self,
210        target_resolution: u8,
211        subset: &H3CellSet,
212    ) -> Result<Self, Error> {
213        self.dataframe()
214            .clone()
215            .h3_uncompact_dataframe_subset(self.h3index_column_name(), target_resolution, subset)
216            .map(|df| H3DataFrame::from_dataframe_nonvalidated(df, self.h3index_column_name()))
217    }
218
219    /// Uncompact the cells while only returning the cells from
220    /// the given `subset`.
221    pub fn h3_uncompact_dataframe_subset_iter<I>(
222        &self,
223        target_resolution: u8,
224        subset: I,
225    ) -> Result<Self, Error>
226    where
227        I: IntoIterator,
228        I::Item: Borrow<H3Cell>,
229    {
230        let subset =
231            change_resolution(subset, target_resolution).collect::<Result<H3CellSet, _>>()?;
232        self.h3_uncompact_dataframe_subset(target_resolution, &subset)
233    }
234}
235
236const UNCOMPACT_JOIN_COL_NAME: &str = "_uncompact_join_idx";
237
238fn uncompact_df<Filter>(
239    df: DataFrame,
240    cell_column_name: &str,
241    target_resolution: u8,
242    filter: Filter,
243) -> Result<DataFrame, Error>
244where
245    Filter: Fn(&H3Cell) -> bool,
246{
247    let unique_cell_ca = df.column(cell_column_name)?.u64()?.unique()?;
248    let cellchunked = unique_cell_ca.h3cell();
249
250    let mut original_indexes = Vec::with_capacity(cellchunked.len());
251    let mut uncompacted_indexes = Vec::with_capacity(cellchunked.len());
252
253    // invalid cells are ignored
254    for cell in cellchunked.iter_indexes_validated().flatten().flatten() {
255        match cell.resolution().cmp(&target_resolution) {
256            Ordering::Less => {
257                for cell_child in cell.get_children(target_resolution)?.iter().filter(&filter) {
258                    original_indexes.push(cell.h3index());
259                    uncompacted_indexes.push(cell_child.h3index());
260                }
261            }
262            Ordering::Equal => {
263                if filter(&cell) {
264                    original_indexes.push(cell.h3index());
265                    uncompacted_indexes.push(cell.h3index());
266                }
267            }
268            Ordering::Greater => {
269                // ignore higher resolution data above the requested target_resolution
270            }
271        }
272    }
273
274    if original_indexes == uncompacted_indexes {
275        // nothing to do
276        return Ok(df);
277    }
278
279    let df = df
280        .lazy()
281        .inner_join(
282            DataFrame::new(vec![
283                Series::new(cell_column_name, original_indexes),
284                Series::new(UNCOMPACT_JOIN_COL_NAME, uncompacted_indexes),
285            ])?
286            .lazy(),
287            col(cell_column_name),
288            col(cell_column_name),
289        )
290        .drop_columns([cell_column_name])
291        .rename([UNCOMPACT_JOIN_COL_NAME], [cell_column_name])
292        .collect()?;
293
294    Ok(df)
295}
296
#[cfg(test)]
mod tests {
    use crate::algorithm::chunkedarray::H3Resolution;
    use crate::algorithm::frame::{H3CompactDataframe, H3UncompactDataframe};
    use crate::algorithm::tests::make_cell_dataframe;
    use crate::AsH3CellChunked;
    use crate::NamedFromIndexes;
    use h3ron::{H3Cell, HasH3Resolution};
    use polars::prelude::{DataFrame, DataType, Series};

    const CELL_COL_NAME: &str = "cell";

    /// Compacts a generated dataframe and uncompacts it again, checking shape,
    /// dtype and cell resolutions after each step.
    fn compact_roundtrip_helper(value: Option<u32>) {
        let max_res = 8;
        let df = make_cell_dataframe(CELL_COL_NAME, max_res, value).unwrap();
        let (rows_before, cols_before) = df.shape();

        // compacting must reduce the number of rows, but keep the columns
        let compacted = df.h3_compact_dataframe(CELL_COL_NAME, true).unwrap();
        assert!(rows_before > compacted.shape().0);
        assert_eq!(cols_before, compacted.shape().1);
        assert_eq!(
            compacted.column(CELL_COL_NAME).unwrap().dtype(),
            &DataType::UInt64
        );

        let resolutions_compacted = compacted
            .column(CELL_COL_NAME)
            .unwrap()
            .u64()
            .unwrap()
            .h3cell()
            .h3_resolution();
        assert!(resolutions_compacted.len() > 1);
        for resolution in &resolutions_compacted {
            assert!(resolution.unwrap() <= max_res);
        }

        // uncompacting must restore the original shape
        let uncompacted = compacted
            .h3_uncompact_dataframe(CELL_COL_NAME, max_res)
            .unwrap();
        assert_eq!(uncompacted.shape(), (rows_before, cols_before));
        assert_eq!(
            uncompacted.column(CELL_COL_NAME).unwrap().dtype(),
            &DataType::UInt64
        );

        let resolutions_uncompacted = uncompacted
            .column(CELL_COL_NAME)
            .unwrap()
            .u64()
            .unwrap()
            .h3cell()
            .h3_resolution();
        assert_eq!(uncompacted.shape().0, resolutions_uncompacted.len());
        for resolution in &resolutions_uncompacted {
            assert_eq!(resolution.unwrap(), max_res);
        }
    }

    #[test]
    fn compact_roundtrip_with_value() {
        compact_roundtrip_helper(Some(7))
    }

    #[test]
    fn compact_roundtrip_without_value() {
        compact_roundtrip_helper(None)
    }

    #[test]
    fn uncompact_subset() {
        let origin_cell = H3Cell::from_coordinate((12.0, 12.0).into(), 5).unwrap();

        let disk_cells = origin_cell
            .grid_disk(12)
            .unwrap()
            .iter()
            .collect::<Vec<_>>();
        let df = DataFrame::new(vec![Series::new_from_indexes(CELL_COL_NAME, disk_cells)]).unwrap();

        // the expected subset: the center child and its direct neighbors, sorted
        let subset_origin = origin_cell.center_child(7).unwrap();
        let mut subset = subset_origin
            .grid_disk(1)
            .unwrap()
            .iter()
            .collect::<Vec<_>>();
        subset.sort_unstable();

        let subset_df = df
            .h3_uncompact_dataframe_subset_iter(
                CELL_COL_NAME,
                subset_origin.h3_resolution(),
                subset.as_slice(),
            )
            .unwrap();
        assert_eq!(subset_df.shape().0, subset.len());

        let mut subset_from_subset_df = subset_df
            .column(CELL_COL_NAME)
            .unwrap()
            .u64()
            .unwrap()
            .h3cell()
            .iter_indexes_validated()
            .flatten()
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        subset_from_subset_df.sort();
        assert_eq!(subset, subset_from_subset_df);
    }
}
415        assert_eq!(subset, subset_from_subset_df);
416    }
417}