Skip to main content

sedona_expr/
statistics.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17use std::str::FromStr;
18
19use datafusion_common::{stats::Precision, ColumnStatistics, DataFusionError, Result, ScalarValue};
20use sedona_common::sedona_internal_datafusion_err;
21use sedona_geometry::interval::{Interval, IntervalTrait};
22use sedona_geometry::{
23    bounding_box::BoundingBox,
24    types::{GeometryTypeAndDimensions, GeometryTypeAndDimensionsSet},
25};
26use serde::{Deserialize, Serialize};
27
28/// Statistics specific to spatial data types
29///
30/// These statistics are an abstraction to provide sedonadb the ability to
31/// perform generic pruning and optimization for datasources that have the
32/// ability to provide this information. This may evolve to support more
33/// fields; however, can currently express Parquet built-in GeoStatistics,
34/// GeoParquet metadata, and GDAL OGR (via GetExtent() and GetGeomType()).
35/// This struct can also represent partial or missing information.
36#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
37pub struct GeoStatistics {
38    // Core spatial statistics for pruning
39    bbox: Option<BoundingBox>, // The overall bounding box (min/max coordinates) containing all geometries
40    geometry_types: Option<GeometryTypeAndDimensionsSet>, // Set of all geometry types and dimensions present
41
42    // Extended statistics for analysis
43    total_geometries: Option<i64>, // Total count of all geometries
44    total_size_bytes: Option<i64>, // Total size of all geometries in bytes
45    total_points: Option<i64>,     // Total number of points/vertices across all geometries
46
47    // Type distribution counts
48    puntal_count: Option<i64>,     // Count of point-type geometries
49    lineal_count: Option<i64>,     // Count of line-type geometries
50    polygonal_count: Option<i64>,  // Count of polygon-type geometries
51    collection_count: Option<i64>, // Count of geometry collections
52
53    // Envelope dimensions statistics
54    total_envelope_width: Option<f64>, // Sum of all envelope widths (for calculating mean width)
55    total_envelope_height: Option<f64>, // Sum of all envelope heights (for calculating mean height)
56}
57
58impl GeoStatistics {
59    /// Statistics representing unspecified information
60    pub const UNSPECIFIED: GeoStatistics = Self {
61        bbox: None,
62        geometry_types: None,
63        total_geometries: None,
64        total_size_bytes: None,
65        total_points: None,
66        puntal_count: None,
67        lineal_count: None,
68        polygonal_count: None,
69        collection_count: None,
70        total_envelope_width: None,
71        total_envelope_height: None,
72    };
73
74    /// Create statistics representing unspecified information
75    pub fn unspecified() -> Self {
76        Self::UNSPECIFIED.clone()
77    }
78
79    /// Create statistics representing empty information (with zero values instead of None)
80    pub fn empty() -> Self {
81        Self {
82            bbox: Some(BoundingBox::xy(Interval::empty(), Interval::empty())),
83            geometry_types: Some(GeometryTypeAndDimensionsSet::new()), // Empty set of geometry types
84            total_geometries: Some(0),                                 // Zero geometries
85            total_size_bytes: Some(0),                                 // Zero bytes
86            total_points: Some(0),                                     // Zero points
87            puntal_count: Some(0),                                     // Zero point geometries
88            lineal_count: Some(0),                                     // Zero line geometries
89            polygonal_count: Some(0),                                  // Zero polygon geometries
90            collection_count: Some(0),                                 // Zero collection geometries
91            total_envelope_width: Some(0.0),                           // Zero width
92            total_envelope_height: Some(0.0),                          // Zero height
93        }
94    }
95
96    /// Update the bounding box and return self
97    pub fn with_bbox(self, bbox: Option<BoundingBox>) -> Self {
98        Self { bbox, ..self }
99    }
100
101    /// Update the geometry types and return self
102    pub fn with_geometry_types(self, types: Option<GeometryTypeAndDimensionsSet>) -> Self {
103        Self {
104            geometry_types: types,
105            ..self
106        }
107    }
108
109    /// Get the bounding box if available
110    pub fn bbox(&self) -> Option<&BoundingBox> {
111        self.bbox.as_ref()
112    }
113
114    /// Get the geometry types if available
115    pub fn geometry_types(&self) -> Option<&GeometryTypeAndDimensionsSet> {
116        self.geometry_types.as_ref()
117    }
118
119    /// Get the total number of geometries if available
120    pub fn total_geometries(&self) -> Option<i64> {
121        self.total_geometries
122    }
123
124    /// Get the total size in bytes if available
125    pub fn total_size_bytes(&self) -> Option<i64> {
126        self.total_size_bytes
127    }
128
129    /// Get the total number of points if available
130    pub fn total_points(&self) -> Option<i64> {
131        self.total_points
132    }
133
134    /// Get the count of puntal geometries if available
135    pub fn puntal_count(&self) -> Option<i64> {
136        self.puntal_count
137    }
138
139    /// Get the count of lineal geometries if available
140    pub fn lineal_count(&self) -> Option<i64> {
141        self.lineal_count
142    }
143
144    /// Get the count of polygonal geometries if available
145    pub fn polygonal_count(&self) -> Option<i64> {
146        self.polygonal_count
147    }
148
149    /// Get the count of geometry collections if available
150    pub fn collection_count(&self) -> Option<i64> {
151        self.collection_count
152    }
153
154    /// Get the total envelope width if available
155    pub fn total_envelope_width(&self) -> Option<f64> {
156        self.total_envelope_width
157    }
158
159    /// Get the total envelope height if available
160    pub fn total_envelope_height(&self) -> Option<f64> {
161        self.total_envelope_height
162    }
163
164    /// Calculate the mean envelope width if possible
165    pub fn mean_envelope_width(&self) -> Option<f64> {
166        match (self.total_envelope_width, self.total_geometries) {
167            (Some(width), Some(count)) if count > 0 => Some(width / count as f64),
168            _ => None,
169        }
170    }
171
172    /// Calculate the mean envelope height if possible
173    pub fn mean_envelope_height(&self) -> Option<f64> {
174        match (self.total_envelope_height, self.total_geometries) {
175            (Some(height), Some(count)) if count > 0 => Some(height / count as f64),
176            _ => None,
177        }
178    }
179
180    /// Calculate the mean envelope area if possible
181    pub fn mean_envelope_area(&self) -> Option<f64> {
182        match (self.mean_envelope_width(), self.mean_envelope_height()) {
183            (Some(width), Some(height)) => Some(width * height),
184            _ => None,
185        }
186    }
187
188    /// Calculate the mean size in bytes if possible
189    pub fn mean_size_bytes(&self) -> Option<f64> {
190        match (self.total_size_bytes, self.total_geometries) {
191            (Some(bytes), Some(count)) if count > 0 => Some(bytes as f64 / count as f64),
192            _ => None,
193        }
194    }
195
196    /// Calculate the mean points per geometry if possible
197    pub fn mean_points_per_geometry(&self) -> Option<f64> {
198        match (self.total_points, self.total_geometries) {
199            (Some(points), Some(count)) if count > 0 => Some(points as f64 / count as f64),
200            _ => None,
201        }
202    }
203
204    /// Update the total geometries count and return self
205    pub fn with_total_geometries(self, count: i64) -> Self {
206        Self {
207            total_geometries: Some(count),
208            ..self
209        }
210    }
211
212    /// Update the total size in bytes and return self
213    pub fn with_total_size_bytes(self, bytes: i64) -> Self {
214        Self {
215            total_size_bytes: Some(bytes),
216            ..self
217        }
218    }
219
220    /// Update the total points count and return self
221    pub fn with_total_points(self, points: i64) -> Self {
222        Self {
223            total_points: Some(points),
224            ..self
225        }
226    }
227
228    /// Update the puntal geometries count and return self
229    pub fn with_puntal_count(self, count: i64) -> Self {
230        Self {
231            puntal_count: Some(count),
232            ..self
233        }
234    }
235
236    /// Update the lineal geometries count and return self
237    pub fn with_lineal_count(self, count: i64) -> Self {
238        Self {
239            lineal_count: Some(count),
240            ..self
241        }
242    }
243
244    /// Update the polygonal geometries count and return self
245    pub fn with_polygonal_count(self, count: i64) -> Self {
246        Self {
247            polygonal_count: Some(count),
248            ..self
249        }
250    }
251
252    /// Update the collection geometries count and return self
253    pub fn with_collection_count(self, count: i64) -> Self {
254        Self {
255            collection_count: Some(count),
256            ..self
257        }
258    }
259
260    /// Update the total envelope width and return self
261    pub fn with_total_envelope_width(self, width: f64) -> Self {
262        Self {
263            total_envelope_width: Some(width),
264            ..self
265        }
266    }
267
268    /// Update the total envelope height and return self
269    pub fn with_total_envelope_height(self, height: f64) -> Self {
270        Self {
271            total_envelope_height: Some(height),
272            ..self
273        }
274    }
275
276    /// Update this statistics object with another one
277    pub fn merge(&mut self, other: &Self) {
278        // Merge bounding boxes
279        if let Some(other_bbox) = &other.bbox {
280            match &mut self.bbox {
281                Some(bbox) => bbox.update_box(other_bbox),
282                None => self.bbox = Some(other_bbox.clone()),
283            }
284        }
285
286        // Merge geometry types
287        if let Some(other_types) = &other.geometry_types {
288            match &mut self.geometry_types {
289                Some(types) => {
290                    types.merge(other_types);
291                }
292                None => self.geometry_types = Some(other_types.clone()),
293            }
294        }
295
296        // Merge counts and totals
297        self.total_geometries =
298            Self::merge_option_add(self.total_geometries, other.total_geometries);
299        self.total_size_bytes =
300            Self::merge_option_add(self.total_size_bytes, other.total_size_bytes);
301        self.total_points = Self::merge_option_add(self.total_points, other.total_points);
302
303        // Merge type counts
304        self.puntal_count = Self::merge_option_add(self.puntal_count, other.puntal_count);
305        self.lineal_count = Self::merge_option_add(self.lineal_count, other.lineal_count);
306        self.polygonal_count = Self::merge_option_add(self.polygonal_count, other.polygonal_count);
307        self.collection_count =
308            Self::merge_option_add(self.collection_count, other.collection_count);
309
310        // Merge envelope dimensions
311        self.total_envelope_width =
312            Self::merge_option_add_f64(self.total_envelope_width, other.total_envelope_width);
313        self.total_envelope_height =
314            Self::merge_option_add_f64(self.total_envelope_height, other.total_envelope_height);
315    }
316
317    // Helper to merge two optional integers with addition
318    fn merge_option_add(a: Option<i64>, b: Option<i64>) -> Option<i64> {
319        match (a, b) {
320            (Some(a_val), Some(b_val)) => Some(a_val + b_val),
321            _ => None,
322        }
323    }
324
325    // Helper to merge two optional floats with addition
326    fn merge_option_add_f64(a: Option<f64>, b: Option<f64>) -> Option<f64> {
327        match (a, b) {
328            (Some(a_val), Some(b_val)) => Some(a_val + b_val),
329            _ => None,
330        }
331    }
332
333    /// Try to deserialize GeoStatistics from DataFusion [ColumnStatistics]
334    ///
335    /// Various DataFusion APIs operate on [ColumnStatistics], which do not support
336    /// spatial statistics natively. This function attempts to reconstruct an object
337    /// that was canonically serialized into one of these objects for transport through
338    /// DataFusion internals.
339    pub fn try_from_column_statistics(stats: &ColumnStatistics) -> Result<Option<Self>> {
340        let scalar = match &stats.sum_value {
341            Precision::Exact(value) => value,
342            _ => {
343                return Ok(None);
344            }
345        };
346
347        if let ScalarValue::Binary(Some(serialized)) = scalar {
348            serde_json::from_slice(serialized).map_err(|e| DataFusionError::External(Box::new(e)))
349        } else {
350            Ok(None)
351        }
352    }
353
354    /// Serialize this object into a [ColumnStatistics]
355    ///
356    /// Canonically place this object into a [ColumnStatistics] for transport through
357    /// DataFusion APIs.
358    pub fn to_column_statistics(&self) -> Result<ColumnStatistics> {
359        let serialized =
360            serde_json::to_vec(self).map_err(|e| DataFusionError::External(Box::new(e)))?;
361        Ok(ColumnStatistics::new_unknown()
362            .with_sum_value(Precision::Exact(ScalarValue::Binary(Some(serialized)))))
363    }
364
365    /// Add a definitive list of geometry type/dimension values specified as strings
366    ///
367    /// This accepts a list of strings like "point z" or "polygon zm". These strings
368    /// are case insensitive.
369    pub fn try_with_str_geometry_types(self, geometry_types: Option<&[&str]>) -> Result<Self> {
370        match geometry_types {
371            Some(strings) => {
372                let mut new_geometry_types = GeometryTypeAndDimensionsSet::new();
373                for string in strings {
374                    let type_and_dim = GeometryTypeAndDimensions::from_str(string)
375                        .map_err(|e| DataFusionError::External(Box::new(e)))?;
376                    new_geometry_types.insert_or_ignore(&type_and_dim);
377                }
378
379                Ok(Self {
380                    geometry_types: Some(new_geometry_types),
381                    ..self
382                })
383            }
384            None => Ok(Self {
385                geometry_types: None,
386                ..self
387            }),
388        }
389    }
390
391    /// Convert this GeoStatistics to a ScalarValue for storage in DataFusion statistics
392    pub fn to_scalar_value(&self) -> Result<ScalarValue> {
393        // Serialize to JSON
394        let serialized = serde_json::to_vec(self).map_err(|e| {
395            sedona_internal_datafusion_err!("Failed to serialize GeoStatistics: {e}")
396        })?;
397
398        Ok(ScalarValue::Binary(Some(serialized)))
399    }
400}
401
402#[cfg(test)]
403mod test {
404    use geo_traits::Dimensions;
405    use sedona_geometry::types::GeometryTypeId;
406
407    use super::*;
408
409    #[test]
410    fn unspecified() {
411        let stats = GeoStatistics::unspecified();
412        assert_eq!(stats.bbox(), None);
413        assert_eq!(stats.geometry_types(), None);
414        assert_eq!(stats.total_geometries(), None);
415        assert_eq!(stats.total_size_bytes(), None);
416        assert_eq!(stats.total_points(), None);
417        assert_eq!(stats.puntal_count(), None);
418        assert_eq!(stats.lineal_count(), None);
419        assert_eq!(stats.polygonal_count(), None);
420        assert_eq!(stats.collection_count(), None);
421        assert_eq!(stats.total_envelope_width(), None);
422        assert_eq!(stats.total_envelope_height(), None);
423
424        let regular_stats = stats.to_column_statistics().unwrap();
425        assert_eq!(
426            GeoStatistics::try_from_column_statistics(&regular_stats)
427                .unwrap()
428                .unwrap(),
429            stats
430        );
431    }
432
433    #[test]
434    fn specified_bbox() {
435        let bbox = BoundingBox::xy((0.0, 1.0), (2.0, 3.0));
436        // Test with_bbox
437        let stats = GeoStatistics::empty().with_bbox(Some(bbox.clone()));
438        assert_eq!(stats.bbox(), Some(&bbox));
439        assert_eq!(
440            stats.geometry_types(),
441            Some(&GeometryTypeAndDimensionsSet::new())
442        );
443
444        let regular_stats = stats.to_column_statistics().unwrap();
445        assert_eq!(
446            GeoStatistics::try_from_column_statistics(&regular_stats)
447                .unwrap()
448                .unwrap(),
449            stats
450        );
451
452        // Test with None
453        let stats_with_none = GeoStatistics::empty().with_bbox(None);
454        assert_eq!(stats_with_none.bbox(), None);
455    }
456
457    #[test]
458    fn specified_geometry_types() {
459        let mut types = GeometryTypeAndDimensionsSet::new();
460        types
461            .insert(&GeometryTypeAndDimensions::new(
462                GeometryTypeId::Polygon,
463                Dimensions::Xy,
464            ))
465            .unwrap();
466
467        // Test with_geometry_types
468        let stats = GeoStatistics::empty().with_geometry_types(Some(types.clone()));
469        assert_eq!(stats.geometry_types(), Some(&types));
470        assert_eq!(
471            stats.bbox(),
472            Some(&BoundingBox::xy(Interval::empty(), Interval::empty()))
473        );
474
475        let regular_stats = stats.to_column_statistics().unwrap();
476        assert_eq!(
477            GeoStatistics::try_from_column_statistics(&regular_stats)
478                .unwrap()
479                .unwrap(),
480            stats
481        );
482
483        // Test with None
484        let stats_with_none = GeoStatistics::empty().with_geometry_types(None);
485        assert_eq!(stats_with_none.geometry_types(), None);
486    }
487
488    #[test]
489    fn specified_geometry_types_by_name() {
490        // Test try_with_str_geometry_types
491        let stats = GeoStatistics::empty()
492            .try_with_str_geometry_types(Some(&["polygon", "point"]))
493            .unwrap();
494
495        let mut expected_types = GeometryTypeAndDimensionsSet::new();
496        expected_types
497            .insert(&GeometryTypeAndDimensions::new(
498                GeometryTypeId::Polygon,
499                Dimensions::Xy,
500            ))
501            .unwrap();
502        expected_types
503            .insert(&GeometryTypeAndDimensions::new(
504                GeometryTypeId::Point,
505                Dimensions::Xy,
506            ))
507            .unwrap();
508
509        assert_eq!(stats.geometry_types(), Some(&expected_types));
510        assert_eq!(
511            stats.bbox(),
512            Some(&BoundingBox::xy(Interval::empty(), Interval::empty()))
513        );
514
515        // Test serialization
516        let regular_stats = stats.to_column_statistics().unwrap();
517        assert_eq!(
518            GeoStatistics::try_from_column_statistics(&regular_stats)
519                .unwrap()
520                .unwrap(),
521            stats
522        );
523    }
524
525    #[test]
526    fn from_non_geometry_stats() {
527        // Can't make geo stats from unknown
528        let stats = ColumnStatistics::new_unknown();
529        assert!(GeoStatistics::try_from_column_statistics(&stats)
530            .unwrap()
531            .is_none());
532
533        // Can't make geo stats from binary null
534        let stats = ColumnStatistics::new_unknown()
535            .with_sum_value(Precision::Exact(ScalarValue::Binary(None)));
536        assert!(GeoStatistics::try_from_column_statistics(&stats)
537            .unwrap()
538            .is_none());
539
540        // Can't make geo stats from binary null
541        let stats = ColumnStatistics::new_unknown()
542            .with_sum_value(Precision::Exact(ScalarValue::Binary(Some(vec![]))));
543        let err = GeoStatistics::try_from_column_statistics(&stats).unwrap_err();
544        assert_eq!(
545            err.message(),
546            "EOF while parsing a value at line 1 column 0"
547        )
548    }
549
550    #[test]
551    fn test_extended_stats() {
552        // Use fluent API with with_* methods
553        let stats = GeoStatistics::empty()
554            .with_total_geometries(100)
555            .with_total_size_bytes(10000)
556            .with_total_points(5000)
557            .with_puntal_count(20)
558            .with_lineal_count(30)
559            .with_polygonal_count(40)
560            .with_collection_count(10)
561            .with_total_envelope_width(500.0)
562            .with_total_envelope_height(300.0);
563
564        // Test getters
565        assert_eq!(stats.total_geometries(), Some(100));
566        assert_eq!(stats.total_size_bytes(), Some(10000));
567        assert_eq!(stats.total_points(), Some(5000));
568        assert_eq!(stats.puntal_count(), Some(20));
569        assert_eq!(stats.lineal_count(), Some(30));
570        assert_eq!(stats.polygonal_count(), Some(40));
571        assert_eq!(stats.collection_count(), Some(10));
572        assert_eq!(stats.total_envelope_width(), Some(500.0));
573        assert_eq!(stats.total_envelope_height(), Some(300.0));
574
575        // Test derived statistics
576        assert_eq!(stats.mean_size_bytes(), Some(100.0));
577        assert_eq!(stats.mean_points_per_geometry(), Some(50.0));
578        assert_eq!(stats.mean_envelope_width(), Some(5.0));
579        assert_eq!(stats.mean_envelope_height(), Some(3.0));
580        assert_eq!(stats.mean_envelope_area(), Some(15.0));
581
582        // Test serialization/deserialization via column statistics
583        let column_stats = stats.to_column_statistics().unwrap();
584        let deserialized = GeoStatistics::try_from_column_statistics(&column_stats)
585            .unwrap()
586            .unwrap();
587        assert_eq!(deserialized, stats);
588    }
589
590    #[test]
591    fn test_merge_extended_stats() {
592        // Create statistics objects using fluent API
593        let stats1 = GeoStatistics::empty()
594            .with_total_geometries(50)
595            .with_total_size_bytes(5000)
596            .with_total_points(2500)
597            .with_puntal_count(10)
598            .with_lineal_count(15)
599            .with_polygonal_count(20)
600            .with_collection_count(5)
601            .with_total_envelope_width(250.0)
602            .with_total_envelope_height(150.0);
603
604        let stats2 = GeoStatistics::empty()
605            .with_total_geometries(50)
606            .with_total_size_bytes(5000)
607            .with_total_points(2500)
608            .with_puntal_count(10)
609            .with_lineal_count(15)
610            .with_polygonal_count(20)
611            .with_collection_count(5)
612            .with_total_envelope_width(250.0)
613            .with_total_envelope_height(150.0);
614
615        // Now merge them
616        let mut merged = stats1.clone();
617        merged.merge(&stats2);
618
619        // Check merged results
620        assert_eq!(merged.total_geometries(), Some(100));
621        assert_eq!(merged.total_size_bytes(), Some(10000));
622        assert_eq!(merged.total_points(), Some(5000));
623        assert_eq!(merged.puntal_count(), Some(20));
624        assert_eq!(merged.lineal_count(), Some(30));
625        assert_eq!(merged.polygonal_count(), Some(40));
626        assert_eq!(merged.collection_count(), Some(10));
627        assert_eq!(merged.total_envelope_width(), Some(500.0));
628        assert_eq!(merged.total_envelope_height(), Some(300.0));
629
630        // Test serialization/deserialization of merged stats
631        let column_stats = merged.to_column_statistics().unwrap();
632        let deserialized = GeoStatistics::try_from_column_statistics(&column_stats)
633            .unwrap()
634            .unwrap();
635        assert_eq!(deserialized, merged);
636    }
637
638    #[test]
639    fn test_partial_merge() {
640        let stats1 = GeoStatistics::empty()
641            .with_total_geometries(50)
642            .with_total_size_bytes(5000);
643
644        let stats2 = GeoStatistics::empty()
645            .with_puntal_count(20)
646            .with_lineal_count(30);
647
648        let mut merged = stats1.clone();
649        merged.merge(&stats2);
650
651        // Check merged results
652        assert_eq!(merged.total_geometries(), Some(50));
653        assert_eq!(merged.total_size_bytes(), Some(5000));
654        assert_eq!(merged.puntal_count(), Some(20));
655        assert_eq!(merged.lineal_count(), Some(30));
656        assert_eq!(merged.polygonal_count(), Some(0));
657
658        // Test serialization/deserialization of partially merged stats
659        let column_stats = merged.to_column_statistics().unwrap();
660        let deserialized = GeoStatistics::try_from_column_statistics(&column_stats)
661            .unwrap()
662            .unwrap();
663        assert_eq!(deserialized, merged);
664    }
665}