h3ron_polars/algorithm/frame/
compact.rs1use crate::algorithm::chunkedarray::H3CompactCells;
2use crate::frame::H3DataFrame;
3use crate::{AsH3CellChunked, Error};
4use h3ron::collections::H3CellSet;
5use h3ron::iter::change_resolution;
6use h3ron::{H3Cell, Index};
7use polars::export::rayon::iter::ParallelIterator;
8use polars::prelude::{
9 col, ChunkUnique, DataFrame, DataType, IntoLazy, IntoSeries, NamedFrom, Series,
10};
11use polars_core::POOL;
12use std::borrow::Borrow;
13use std::cmp::Ordering;
14
15pub trait H3CompactDataframe {
16 fn h3_compact_dataframe<S>(
21 self,
22 cell_column_name: S,
23 return_exploded: bool,
24 ) -> Result<Self, Error>
25 where
26 Self: Sized,
27 S: AsRef<str>;
28}
29
30impl H3CompactDataframe for DataFrame {
31 fn h3_compact_dataframe<S>(
32 self,
33 cell_column_name: S,
34 return_exploded: bool,
35 ) -> Result<Self, Error>
36 where
37 S: AsRef<str>,
38 {
39 compact_df(self, cell_column_name.as_ref(), return_exploded)
40 }
41}
42
43fn compact_df(
44 df: DataFrame,
45 cell_column_name: &str,
46 return_exploded: bool,
47) -> Result<DataFrame, Error> {
48 let group_by_columns = df
49 .fields()
50 .iter()
51 .filter_map(|field| {
52 if field.name() != cell_column_name {
53 Some(col(field.name()))
54 } else {
55 None
56 }
57 })
58 .collect::<Vec<_>>();
59
60 if group_by_columns.is_empty() {
61 let cellchunked = df.column(cell_column_name)?.u64()?.h3cell();
62 let compacted_series = Series::new(cell_column_name, cellchunked.h3_compact_cells()?);
63
64 if return_exploded {
65 Ok(DataFrame::new(vec![compacted_series])?)
66 } else {
67 Ok(DataFrame::new(vec![Series::new(
68 cell_column_name,
69 vec![compacted_series],
70 )])?)
71 }
72 } else {
73 let grouped = df
74 .lazy()
75 .groupby(&group_by_columns)
76 .agg(&[col(cell_column_name).unique()])
77 .collect()?;
78
79 let listchunked_cells = grouped.column(cell_column_name)?.list()?;
80 let compacted_series_vec = POOL.install(|| {
81 listchunked_cells
83 .par_iter()
84 .map(compact_maybe_series)
85 .collect::<Result<Vec<_>, _>>()
86 })?;
87
88 let mut grouped = grouped.drop(cell_column_name)?;
89 grouped.with_column(Series::new(cell_column_name, compacted_series_vec))?;
90
91 if return_exploded {
92 Ok(grouped.explode([cell_column_name])?)
93 } else {
94 Ok(grouped)
95 }
96 }
97}
98
99fn compact_maybe_series(maybe_series: Option<Series>) -> Result<Series, Error> {
100 let compacted_series = if let Some(series) = maybe_series {
101 series.u64()?.h3cell().h3_compact_cells()?.into_series()
102 } else {
103 Series::new_empty("", &DataType::UInt64)
104 };
105 Ok(compacted_series)
106}
107
108pub trait H3UncompactDataframe {
109 fn h3_uncompact_dataframe<S>(
113 self,
114 cell_column_name: S,
115 target_resolution: u8,
116 ) -> Result<Self, Error>
117 where
118 Self: Sized,
119 S: AsRef<str>;
120
121 fn h3_uncompact_dataframe_subset<S>(
126 self,
127 cell_column_name: S,
128 target_resolution: u8,
129 subset: &H3CellSet,
130 ) -> Result<Self, Error>
131 where
132 Self: Sized,
133 S: AsRef<str>;
134
135 fn h3_uncompact_dataframe_subset_iter<S, I>(
140 self,
141 cell_column_name: S,
142 target_resolution: u8,
143 subset: I,
144 ) -> Result<Self, Error>
145 where
146 Self: Sized,
147 S: AsRef<str>,
148 I: IntoIterator,
149 I::Item: Borrow<H3Cell>,
150 {
151 let subset =
152 change_resolution(subset, target_resolution).collect::<Result<H3CellSet, _>>()?;
153 self.h3_uncompact_dataframe_subset(cell_column_name, target_resolution, &subset)
154 }
155}
156
157impl H3UncompactDataframe for DataFrame {
158 fn h3_uncompact_dataframe<S>(
159 self,
160 cell_column_name: S,
161 target_resolution: u8,
162 ) -> Result<Self, Error>
163 where
164 Self: Sized,
165 S: AsRef<str>,
166 {
167 uncompact_df(self, cell_column_name.as_ref(), target_resolution, |_| true)
168 }
169
170 fn h3_uncompact_dataframe_subset<S>(
171 self,
172 cell_column_name: S,
173 target_resolution: u8,
174 subset: &H3CellSet,
175 ) -> Result<Self, Error>
176 where
177 Self: Sized,
178 S: AsRef<str>,
179 {
180 uncompact_df(self, cell_column_name.as_ref(), target_resolution, |cell| {
181 subset.contains(cell)
182 })
183 }
184}
185
186impl H3DataFrame<H3Cell> {
187 pub fn h3_compact_dataframe(&self, return_exploded: bool) -> Result<Self, Error> {
192 self.dataframe()
193 .clone()
194 .h3_compact_dataframe(self.h3index_column_name(), return_exploded)
195 .map(|df| H3DataFrame::from_dataframe_nonvalidated(df, self.h3index_column_name()))
196 }
197
198 pub fn h3_uncompact_dataframe(&self, target_resolution: u8) -> Result<Self, Error> {
200 self.dataframe()
201 .clone()
202 .h3_uncompact_dataframe(self.h3index_column_name(), target_resolution)
203 .map(|df| H3DataFrame::from_dataframe_nonvalidated(df, self.h3index_column_name()))
204 }
205
206 pub fn h3_uncompact_dataframe_subset(
209 &self,
210 target_resolution: u8,
211 subset: &H3CellSet,
212 ) -> Result<Self, Error> {
213 self.dataframe()
214 .clone()
215 .h3_uncompact_dataframe_subset(self.h3index_column_name(), target_resolution, subset)
216 .map(|df| H3DataFrame::from_dataframe_nonvalidated(df, self.h3index_column_name()))
217 }
218
219 pub fn h3_uncompact_dataframe_subset_iter<I>(
222 &self,
223 target_resolution: u8,
224 subset: I,
225 ) -> Result<Self, Error>
226 where
227 I: IntoIterator,
228 I::Item: Borrow<H3Cell>,
229 {
230 let subset =
231 change_resolution(subset, target_resolution).collect::<Result<H3CellSet, _>>()?;
232 self.h3_uncompact_dataframe_subset(target_resolution, &subset)
233 }
234}
235
236const UNCOMPACT_JOIN_COL_NAME: &str = "_uncompact_join_idx";
237
238fn uncompact_df<Filter>(
239 df: DataFrame,
240 cell_column_name: &str,
241 target_resolution: u8,
242 filter: Filter,
243) -> Result<DataFrame, Error>
244where
245 Filter: Fn(&H3Cell) -> bool,
246{
247 let unique_cell_ca = df.column(cell_column_name)?.u64()?.unique()?;
248 let cellchunked = unique_cell_ca.h3cell();
249
250 let mut original_indexes = Vec::with_capacity(cellchunked.len());
251 let mut uncompacted_indexes = Vec::with_capacity(cellchunked.len());
252
253 for cell in cellchunked.iter_indexes_validated().flatten().flatten() {
255 match cell.resolution().cmp(&target_resolution) {
256 Ordering::Less => {
257 for cell_child in cell.get_children(target_resolution)?.iter().filter(&filter) {
258 original_indexes.push(cell.h3index());
259 uncompacted_indexes.push(cell_child.h3index());
260 }
261 }
262 Ordering::Equal => {
263 if filter(&cell) {
264 original_indexes.push(cell.h3index());
265 uncompacted_indexes.push(cell.h3index());
266 }
267 }
268 Ordering::Greater => {
269 }
271 }
272 }
273
274 if original_indexes == uncompacted_indexes {
275 return Ok(df);
277 }
278
279 let df = df
280 .lazy()
281 .inner_join(
282 DataFrame::new(vec![
283 Series::new(cell_column_name, original_indexes),
284 Series::new(UNCOMPACT_JOIN_COL_NAME, uncompacted_indexes),
285 ])?
286 .lazy(),
287 col(cell_column_name),
288 col(cell_column_name),
289 )
290 .drop_columns([cell_column_name])
291 .rename([UNCOMPACT_JOIN_COL_NAME], [cell_column_name])
292 .collect()?;
293
294 Ok(df)
295}
296
#[cfg(test)]
mod tests {
    use crate::algorithm::chunkedarray::H3Resolution;
    use crate::algorithm::frame::{H3CompactDataframe, H3UncompactDataframe};
    use crate::algorithm::tests::make_cell_dataframe;
    use crate::AsH3CellChunked;
    use crate::NamedFromIndexes;
    use h3ron::{H3Cell, HasH3Resolution};
    use polars::prelude::{DataFrame, DataType, Series};

    const CELL_COL_NAME: &str = "cell";

    /// Compacts a generated dataframe, uncompacts it again and checks that shape,
    /// dtype and resolutions behave as expected along the way.
    fn compact_roundtrip_helper(value: Option<u32>) {
        let max_res = 8;
        let input = make_cell_dataframe(CELL_COL_NAME, max_res, value).unwrap();
        let (rows_before, columns_before) = input.shape();

        let compacted = input.h3_compact_dataframe(CELL_COL_NAME, true).unwrap();

        // compacting must reduce the number of rows, but keep the columns
        let (compacted_rows, compacted_columns) = compacted.shape();
        assert!(rows_before > compacted_rows);
        assert_eq!(columns_before, compacted_columns);
        assert_eq!(
            compacted.column(CELL_COL_NAME).unwrap().dtype(),
            &DataType::UInt64
        );

        let compacted_resolutions = compacted
            .column(CELL_COL_NAME)
            .unwrap()
            .u64()
            .unwrap()
            .h3cell()
            .h3_resolution();
        assert!(compacted_resolutions.len() > 1);
        for resolution in &compacted_resolutions {
            assert!(resolution.unwrap() <= max_res);
        }

        // uncompacting must restore the original shape
        let uncompacted = compacted
            .h3_uncompact_dataframe(CELL_COL_NAME, max_res)
            .unwrap();
        assert_eq!(uncompacted.shape(), (rows_before, columns_before));
        assert_eq!(
            uncompacted.column(CELL_COL_NAME).unwrap().dtype(),
            &DataType::UInt64
        );

        let uncompacted_resolutions = uncompacted
            .column(CELL_COL_NAME)
            .unwrap()
            .u64()
            .unwrap()
            .h3cell()
            .h3_resolution();
        assert_eq!(uncompacted.shape().0, uncompacted_resolutions.len());
        for resolution in &uncompacted_resolutions {
            assert_eq!(resolution.unwrap(), max_res);
        }
    }

    #[test]
    fn compact_roundtrip_with_value() {
        compact_roundtrip_helper(Some(7))
    }

    #[test]
    fn compact_roundtrip_without_value() {
        compact_roundtrip_helper(None)
    }

    /// Uncompacting with a subset must return exactly the cells of the subset.
    #[test]
    fn uncompact_subset() {
        let origin_cell = H3Cell::from_coordinate((12.0, 12.0).into(), 5).unwrap();

        let disk_cells = origin_cell
            .grid_disk(12)
            .unwrap()
            .iter()
            .collect::<Vec<_>>();
        let df = DataFrame::new(vec![Series::new_from_indexes(CELL_COL_NAME, disk_cells)]).unwrap();

        let subset_origin = origin_cell.center_child(7).unwrap();
        let mut subset = subset_origin
            .grid_disk(1)
            .unwrap()
            .iter()
            .collect::<Vec<_>>();
        subset.sort_unstable();

        let subset_df = df
            .h3_uncompact_dataframe_subset_iter(
                CELL_COL_NAME,
                subset_origin.h3_resolution(),
                subset.as_slice(),
            )
            .unwrap();
        assert_eq!(subset_df.shape().0, subset.len());

        let mut subset_from_subset_df = subset_df
            .column(CELL_COL_NAME)
            .unwrap()
            .u64()
            .unwrap()
            .h3cell()
            .iter_indexes_validated()
            .flatten()
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        subset_from_subset_df.sort();

        assert_eq!(subset, subset_from_subset_df);
    }
}
417}