sedona_expr/
statistics.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17use std::str::FromStr;
18
19use datafusion_common::{stats::Precision, ColumnStatistics, DataFusionError, Result, ScalarValue};
20use sedona_geometry::interval::{Interval, IntervalTrait};
21use sedona_geometry::{
22    bounding_box::BoundingBox,
23    types::{GeometryTypeAndDimensions, GeometryTypeAndDimensionsSet},
24};
25use serde::{Deserialize, Serialize};
26
27/// Statistics specific to spatial data types
28///
29/// These statistics are an abstraction to provide sedonadb the ability to
30/// perform generic pruning and optimization for datasources that have the
31/// ability to provide this information. This may evolve to support more
32/// fields; however, can currently express Parquet built-in GeoStatistics,
33/// GeoParquet metadata, and GDAL OGR (via GetExtent() and GetGeomType()).
34/// This struct can also represent partial or missing information.
35#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
36pub struct GeoStatistics {
37    // Core spatial statistics for pruning
38    bbox: Option<BoundingBox>, // The overall bounding box (min/max coordinates) containing all geometries
39    geometry_types: Option<GeometryTypeAndDimensionsSet>, // Set of all geometry types and dimensions present
40
41    // Extended statistics for analysis
42    total_geometries: Option<i64>, // Total count of all geometries
43    total_size_bytes: Option<i64>, // Total size of all geometries in bytes
44    total_points: Option<i64>,     // Total number of points/vertices across all geometries
45
46    // Type distribution counts
47    puntal_count: Option<i64>,     // Count of point-type geometries
48    lineal_count: Option<i64>,     // Count of line-type geometries
49    polygonal_count: Option<i64>,  // Count of polygon-type geometries
50    collection_count: Option<i64>, // Count of geometry collections
51
52    // Envelope dimensions statistics
53    total_envelope_width: Option<f64>, // Sum of all envelope widths (for calculating mean width)
54    total_envelope_height: Option<f64>, // Sum of all envelope heights (for calculating mean height)
55}
56
57impl GeoStatistics {
58    /// Statistics representing unspecified information
59    pub const UNSPECIFIED: GeoStatistics = Self {
60        bbox: None,
61        geometry_types: None,
62        total_geometries: None,
63        total_size_bytes: None,
64        total_points: None,
65        puntal_count: None,
66        lineal_count: None,
67        polygonal_count: None,
68        collection_count: None,
69        total_envelope_width: None,
70        total_envelope_height: None,
71    };
72
73    /// Create statistics representing unspecified information
74    pub fn unspecified() -> Self {
75        Self::UNSPECIFIED.clone()
76    }
77
78    /// Create statistics representing empty information (with zero values instead of None)
79    pub fn empty() -> Self {
80        Self {
81            bbox: Some(BoundingBox::xy(Interval::empty(), Interval::empty())),
82            geometry_types: Some(GeometryTypeAndDimensionsSet::new()), // Empty set of geometry types
83            total_geometries: Some(0),                                 // Zero geometries
84            total_size_bytes: Some(0),                                 // Zero bytes
85            total_points: Some(0),                                     // Zero points
86            puntal_count: Some(0),                                     // Zero point geometries
87            lineal_count: Some(0),                                     // Zero line geometries
88            polygonal_count: Some(0),                                  // Zero polygon geometries
89            collection_count: Some(0),                                 // Zero collection geometries
90            total_envelope_width: Some(0.0),                           // Zero width
91            total_envelope_height: Some(0.0),                          // Zero height
92        }
93    }
94
95    /// Update the bounding box and return self
96    pub fn with_bbox(self, bbox: Option<BoundingBox>) -> Self {
97        Self { bbox, ..self }
98    }
99
100    /// Update the geometry types and return self
101    pub fn with_geometry_types(self, types: Option<GeometryTypeAndDimensionsSet>) -> Self {
102        Self {
103            geometry_types: types,
104            ..self
105        }
106    }
107
    /// Get the bounding box if available
    ///
    /// Returns a borrowed reference to the stored box, or `None` when no
    /// bounding box is recorded.
    pub fn bbox(&self) -> Option<&BoundingBox> {
        self.bbox.as_ref()
    }
112
    /// Get the geometry types if available
    ///
    /// Returns a borrowed reference to the stored set of geometry
    /// type/dimension combinations, or `None` when no set is recorded.
    pub fn geometry_types(&self) -> Option<&GeometryTypeAndDimensionsSet> {
        self.geometry_types.as_ref()
    }
117
    /// Get the total number of geometries if available
    ///
    /// Returns `None` when the count is not recorded.
    pub fn total_geometries(&self) -> Option<i64> {
        self.total_geometries
    }
122
    /// Get the total size in bytes if available
    ///
    /// Returns `None` when the size is not recorded.
    pub fn total_size_bytes(&self) -> Option<i64> {
        self.total_size_bytes
    }
127
    /// Get the total number of points if available
    ///
    /// Returns `None` when the point count is not recorded.
    pub fn total_points(&self) -> Option<i64> {
        self.total_points
    }
132
    /// Get the count of puntal (point-type) geometries if available
    ///
    /// Returns `None` when the count is not recorded.
    pub fn puntal_count(&self) -> Option<i64> {
        self.puntal_count
    }
137
    /// Get the count of lineal (line-type) geometries if available
    ///
    /// Returns `None` when the count is not recorded.
    pub fn lineal_count(&self) -> Option<i64> {
        self.lineal_count
    }
142
    /// Get the count of polygonal (polygon-type) geometries if available
    ///
    /// Returns `None` when the count is not recorded.
    pub fn polygonal_count(&self) -> Option<i64> {
        self.polygonal_count
    }
147
    /// Get the count of geometry collections if available
    ///
    /// Returns `None` when the count is not recorded.
    pub fn collection_count(&self) -> Option<i64> {
        self.collection_count
    }
152
    /// Get the total envelope width if available
    ///
    /// This is the sum of the envelope widths, not a mean; see
    /// `mean_envelope_width` for the per-geometry average. Returns `None`
    /// when the sum is not recorded.
    pub fn total_envelope_width(&self) -> Option<f64> {
        self.total_envelope_width
    }
157
    /// Get the total envelope height if available
    ///
    /// This is the sum of the envelope heights, not a mean; see
    /// `mean_envelope_height` for the per-geometry average. Returns `None`
    /// when the sum is not recorded.
    pub fn total_envelope_height(&self) -> Option<f64> {
        self.total_envelope_height
    }
162
163    /// Calculate the mean envelope width if possible
164    pub fn mean_envelope_width(&self) -> Option<f64> {
165        match (self.total_envelope_width, self.total_geometries) {
166            (Some(width), Some(count)) if count > 0 => Some(width / count as f64),
167            _ => None,
168        }
169    }
170
171    /// Calculate the mean envelope height if possible
172    pub fn mean_envelope_height(&self) -> Option<f64> {
173        match (self.total_envelope_height, self.total_geometries) {
174            (Some(height), Some(count)) if count > 0 => Some(height / count as f64),
175            _ => None,
176        }
177    }
178
179    /// Calculate the mean envelope area if possible
180    pub fn mean_envelope_area(&self) -> Option<f64> {
181        match (self.mean_envelope_width(), self.mean_envelope_height()) {
182            (Some(width), Some(height)) => Some(width * height),
183            _ => None,
184        }
185    }
186
187    /// Calculate the mean size in bytes if possible
188    pub fn mean_size_bytes(&self) -> Option<f64> {
189        match (self.total_size_bytes, self.total_geometries) {
190            (Some(bytes), Some(count)) if count > 0 => Some(bytes as f64 / count as f64),
191            _ => None,
192        }
193    }
194
195    /// Calculate the mean points per geometry if possible
196    pub fn mean_points_per_geometry(&self) -> Option<f64> {
197        match (self.total_points, self.total_geometries) {
198            (Some(points), Some(count)) if count > 0 => Some(points as f64 / count as f64),
199            _ => None,
200        }
201    }
202
203    /// Update the total geometries count and return self
204    pub fn with_total_geometries(self, count: i64) -> Self {
205        Self {
206            total_geometries: Some(count),
207            ..self
208        }
209    }
210
211    /// Update the total size in bytes and return self
212    pub fn with_total_size_bytes(self, bytes: i64) -> Self {
213        Self {
214            total_size_bytes: Some(bytes),
215            ..self
216        }
217    }
218
219    /// Update the total points count and return self
220    pub fn with_total_points(self, points: i64) -> Self {
221        Self {
222            total_points: Some(points),
223            ..self
224        }
225    }
226
227    /// Update the puntal geometries count and return self
228    pub fn with_puntal_count(self, count: i64) -> Self {
229        Self {
230            puntal_count: Some(count),
231            ..self
232        }
233    }
234
235    /// Update the lineal geometries count and return self
236    pub fn with_lineal_count(self, count: i64) -> Self {
237        Self {
238            lineal_count: Some(count),
239            ..self
240        }
241    }
242
243    /// Update the polygonal geometries count and return self
244    pub fn with_polygonal_count(self, count: i64) -> Self {
245        Self {
246            polygonal_count: Some(count),
247            ..self
248        }
249    }
250
251    /// Update the collection geometries count and return self
252    pub fn with_collection_count(self, count: i64) -> Self {
253        Self {
254            collection_count: Some(count),
255            ..self
256        }
257    }
258
259    /// Update the total envelope width and return self
260    pub fn with_total_envelope_width(self, width: f64) -> Self {
261        Self {
262            total_envelope_width: Some(width),
263            ..self
264        }
265    }
266
267    /// Update the total envelope height and return self
268    pub fn with_total_envelope_height(self, height: f64) -> Self {
269        Self {
270            total_envelope_height: Some(height),
271            ..self
272        }
273    }
274
275    /// Update this statistics object with another one
276    pub fn merge(&mut self, other: &Self) {
277        // Merge bounding boxes
278        if let Some(other_bbox) = &other.bbox {
279            match &mut self.bbox {
280                Some(bbox) => bbox.update_box(other_bbox),
281                None => self.bbox = Some(other_bbox.clone()),
282            }
283        }
284
285        // Merge geometry types
286        if let Some(other_types) = &other.geometry_types {
287            match &mut self.geometry_types {
288                Some(types) => {
289                    types.merge(other_types);
290                }
291                None => self.geometry_types = Some(other_types.clone()),
292            }
293        }
294
295        // Merge counts and totals
296        self.total_geometries =
297            Self::merge_option_add(self.total_geometries, other.total_geometries);
298        self.total_size_bytes =
299            Self::merge_option_add(self.total_size_bytes, other.total_size_bytes);
300        self.total_points = Self::merge_option_add(self.total_points, other.total_points);
301
302        // Merge type counts
303        self.puntal_count = Self::merge_option_add(self.puntal_count, other.puntal_count);
304        self.lineal_count = Self::merge_option_add(self.lineal_count, other.lineal_count);
305        self.polygonal_count = Self::merge_option_add(self.polygonal_count, other.polygonal_count);
306        self.collection_count =
307            Self::merge_option_add(self.collection_count, other.collection_count);
308
309        // Merge envelope dimensions
310        self.total_envelope_width =
311            Self::merge_option_add_f64(self.total_envelope_width, other.total_envelope_width);
312        self.total_envelope_height =
313            Self::merge_option_add_f64(self.total_envelope_height, other.total_envelope_height);
314    }
315
316    // Helper to merge two optional integers with addition
317    fn merge_option_add(a: Option<i64>, b: Option<i64>) -> Option<i64> {
318        match (a, b) {
319            (Some(a_val), Some(b_val)) => Some(a_val + b_val),
320            _ => None,
321        }
322    }
323
324    // Helper to merge two optional floats with addition
325    fn merge_option_add_f64(a: Option<f64>, b: Option<f64>) -> Option<f64> {
326        match (a, b) {
327            (Some(a_val), Some(b_val)) => Some(a_val + b_val),
328            _ => None,
329        }
330    }
331
332    /// Try to deserialize GeoStatistics from DataFusion [ColumnStatistics]
333    ///
334    /// Various DataFusion APIs operate on [ColumnStatistics], which do not support
335    /// spatial statistics natively. This function attempts to reconstruct an object
336    /// that was canonically serialized into one of these objects for transport through
337    /// DataFusion internals.
338    pub fn try_from_column_statistics(stats: &ColumnStatistics) -> Result<Option<Self>> {
339        let scalar = match &stats.sum_value {
340            Precision::Exact(value) => value,
341            _ => {
342                return Ok(None);
343            }
344        };
345
346        if let ScalarValue::Binary(Some(serialized)) = scalar {
347            serde_json::from_slice(serialized).map_err(|e| DataFusionError::External(Box::new(e)))
348        } else {
349            Ok(None)
350        }
351    }
352
353    /// Serialize this object into a [ColumnStatistics]
354    ///
355    /// Canonically place this object into a [ColumnStatistics] for transport through
356    /// DataFusion APIs.
357    pub fn to_column_statistics(&self) -> Result<ColumnStatistics> {
358        let serialized =
359            serde_json::to_vec(self).map_err(|e| DataFusionError::External(Box::new(e)))?;
360        Ok(ColumnStatistics::new_unknown()
361            .with_sum_value(Precision::Exact(ScalarValue::Binary(Some(serialized)))))
362    }
363
364    /// Add a definitive list of geometry type/dimension values specified as strings
365    ///
366    /// This accepts a list of strings like "point z" or "polygon zm". These strings
367    /// are case insensitive.
368    pub fn try_with_str_geometry_types(self, geometry_types: Option<&[&str]>) -> Result<Self> {
369        match geometry_types {
370            Some(strings) => {
371                let mut new_geometry_types = GeometryTypeAndDimensionsSet::new();
372                for string in strings {
373                    let type_and_dim = GeometryTypeAndDimensions::from_str(string)
374                        .map_err(|e| DataFusionError::External(Box::new(e)))?;
375                    new_geometry_types.insert_or_ignore(&type_and_dim);
376                }
377
378                Ok(Self {
379                    geometry_types: Some(new_geometry_types),
380                    ..self
381                })
382            }
383            None => Ok(Self {
384                geometry_types: None,
385                ..self
386            }),
387        }
388    }
389
390    /// Convert this GeoStatistics to a ScalarValue for storage in DataFusion statistics
391    pub fn to_scalar_value(&self) -> Result<ScalarValue> {
392        // Serialize to JSON
393        let serialized = serde_json::to_vec(self).map_err(|e| {
394            DataFusionError::Internal(format!("Failed to serialize GeoStatistics: {e}"))
395        })?;
396
397        Ok(ScalarValue::Binary(Some(serialized)))
398    }
399}
400
401#[cfg(test)]
402mod test {
403    use geo_traits::Dimensions;
404    use sedona_geometry::types::GeometryTypeId;
405
406    use super::*;
407
408    #[test]
409    fn unspecified() {
410        let stats = GeoStatistics::unspecified();
411        assert_eq!(stats.bbox(), None);
412        assert_eq!(stats.geometry_types(), None);
413        assert_eq!(stats.total_geometries(), None);
414        assert_eq!(stats.total_size_bytes(), None);
415        assert_eq!(stats.total_points(), None);
416        assert_eq!(stats.puntal_count(), None);
417        assert_eq!(stats.lineal_count(), None);
418        assert_eq!(stats.polygonal_count(), None);
419        assert_eq!(stats.collection_count(), None);
420        assert_eq!(stats.total_envelope_width(), None);
421        assert_eq!(stats.total_envelope_height(), None);
422
423        let regular_stats = stats.to_column_statistics().unwrap();
424        assert_eq!(
425            GeoStatistics::try_from_column_statistics(&regular_stats)
426                .unwrap()
427                .unwrap(),
428            stats
429        );
430    }
431
432    #[test]
433    fn specified_bbox() {
434        let bbox = BoundingBox::xy((0.0, 1.0), (2.0, 3.0));
435        // Test with_bbox
436        let stats = GeoStatistics::empty().with_bbox(Some(bbox.clone()));
437        assert_eq!(stats.bbox(), Some(&bbox));
438        assert_eq!(
439            stats.geometry_types(),
440            Some(&GeometryTypeAndDimensionsSet::new())
441        );
442
443        let regular_stats = stats.to_column_statistics().unwrap();
444        assert_eq!(
445            GeoStatistics::try_from_column_statistics(&regular_stats)
446                .unwrap()
447                .unwrap(),
448            stats
449        );
450
451        // Test with None
452        let stats_with_none = GeoStatistics::empty().with_bbox(None);
453        assert_eq!(stats_with_none.bbox(), None);
454    }
455
456    #[test]
457    fn specified_geometry_types() {
458        let mut types = GeometryTypeAndDimensionsSet::new();
459        types
460            .insert(&GeometryTypeAndDimensions::new(
461                GeometryTypeId::Polygon,
462                Dimensions::Xy,
463            ))
464            .unwrap();
465
466        // Test with_geometry_types
467        let stats = GeoStatistics::empty().with_geometry_types(Some(types.clone()));
468        assert_eq!(stats.geometry_types(), Some(&types));
469        assert_eq!(
470            stats.bbox(),
471            Some(&BoundingBox::xy(Interval::empty(), Interval::empty()))
472        );
473
474        let regular_stats = stats.to_column_statistics().unwrap();
475        assert_eq!(
476            GeoStatistics::try_from_column_statistics(&regular_stats)
477                .unwrap()
478                .unwrap(),
479            stats
480        );
481
482        // Test with None
483        let stats_with_none = GeoStatistics::empty().with_geometry_types(None);
484        assert_eq!(stats_with_none.geometry_types(), None);
485    }
486
487    #[test]
488    fn specified_geometry_types_by_name() {
489        // Test try_with_str_geometry_types
490        let stats = GeoStatistics::empty()
491            .try_with_str_geometry_types(Some(&["polygon", "point"]))
492            .unwrap();
493
494        let mut expected_types = GeometryTypeAndDimensionsSet::new();
495        expected_types
496            .insert(&GeometryTypeAndDimensions::new(
497                GeometryTypeId::Polygon,
498                Dimensions::Xy,
499            ))
500            .unwrap();
501        expected_types
502            .insert(&GeometryTypeAndDimensions::new(
503                GeometryTypeId::Point,
504                Dimensions::Xy,
505            ))
506            .unwrap();
507
508        assert_eq!(stats.geometry_types(), Some(&expected_types));
509        assert_eq!(
510            stats.bbox(),
511            Some(&BoundingBox::xy(Interval::empty(), Interval::empty()))
512        );
513
514        // Test serialization
515        let regular_stats = stats.to_column_statistics().unwrap();
516        assert_eq!(
517            GeoStatistics::try_from_column_statistics(&regular_stats)
518                .unwrap()
519                .unwrap(),
520            stats
521        );
522    }
523
524    #[test]
525    fn from_non_geometry_stats() {
526        // Can't make geo stats from unknown
527        let stats = ColumnStatistics::new_unknown();
528        assert!(GeoStatistics::try_from_column_statistics(&stats)
529            .unwrap()
530            .is_none());
531
532        // Can't make geo stats from binary null
533        let stats = ColumnStatistics::new_unknown()
534            .with_sum_value(Precision::Exact(ScalarValue::Binary(None)));
535        assert!(GeoStatistics::try_from_column_statistics(&stats)
536            .unwrap()
537            .is_none());
538
539        // Can't make geo stats from binary null
540        let stats = ColumnStatistics::new_unknown()
541            .with_sum_value(Precision::Exact(ScalarValue::Binary(Some(vec![]))));
542        let err = GeoStatistics::try_from_column_statistics(&stats).unwrap_err();
543        assert_eq!(
544            err.message(),
545            "EOF while parsing a value at line 1 column 0"
546        )
547    }
548
549    #[test]
550    fn test_extended_stats() {
551        // Use fluent API with with_* methods
552        let stats = GeoStatistics::empty()
553            .with_total_geometries(100)
554            .with_total_size_bytes(10000)
555            .with_total_points(5000)
556            .with_puntal_count(20)
557            .with_lineal_count(30)
558            .with_polygonal_count(40)
559            .with_collection_count(10)
560            .with_total_envelope_width(500.0)
561            .with_total_envelope_height(300.0);
562
563        // Test getters
564        assert_eq!(stats.total_geometries(), Some(100));
565        assert_eq!(stats.total_size_bytes(), Some(10000));
566        assert_eq!(stats.total_points(), Some(5000));
567        assert_eq!(stats.puntal_count(), Some(20));
568        assert_eq!(stats.lineal_count(), Some(30));
569        assert_eq!(stats.polygonal_count(), Some(40));
570        assert_eq!(stats.collection_count(), Some(10));
571        assert_eq!(stats.total_envelope_width(), Some(500.0));
572        assert_eq!(stats.total_envelope_height(), Some(300.0));
573
574        // Test derived statistics
575        assert_eq!(stats.mean_size_bytes(), Some(100.0));
576        assert_eq!(stats.mean_points_per_geometry(), Some(50.0));
577        assert_eq!(stats.mean_envelope_width(), Some(5.0));
578        assert_eq!(stats.mean_envelope_height(), Some(3.0));
579        assert_eq!(stats.mean_envelope_area(), Some(15.0));
580
581        // Test serialization/deserialization via column statistics
582        let column_stats = stats.to_column_statistics().unwrap();
583        let deserialized = GeoStatistics::try_from_column_statistics(&column_stats)
584            .unwrap()
585            .unwrap();
586        assert_eq!(deserialized, stats);
587    }
588
589    #[test]
590    fn test_merge_extended_stats() {
591        // Create statistics objects using fluent API
592        let stats1 = GeoStatistics::empty()
593            .with_total_geometries(50)
594            .with_total_size_bytes(5000)
595            .with_total_points(2500)
596            .with_puntal_count(10)
597            .with_lineal_count(15)
598            .with_polygonal_count(20)
599            .with_collection_count(5)
600            .with_total_envelope_width(250.0)
601            .with_total_envelope_height(150.0);
602
603        let stats2 = GeoStatistics::empty()
604            .with_total_geometries(50)
605            .with_total_size_bytes(5000)
606            .with_total_points(2500)
607            .with_puntal_count(10)
608            .with_lineal_count(15)
609            .with_polygonal_count(20)
610            .with_collection_count(5)
611            .with_total_envelope_width(250.0)
612            .with_total_envelope_height(150.0);
613
614        // Now merge them
615        let mut merged = stats1.clone();
616        merged.merge(&stats2);
617
618        // Check merged results
619        assert_eq!(merged.total_geometries(), Some(100));
620        assert_eq!(merged.total_size_bytes(), Some(10000));
621        assert_eq!(merged.total_points(), Some(5000));
622        assert_eq!(merged.puntal_count(), Some(20));
623        assert_eq!(merged.lineal_count(), Some(30));
624        assert_eq!(merged.polygonal_count(), Some(40));
625        assert_eq!(merged.collection_count(), Some(10));
626        assert_eq!(merged.total_envelope_width(), Some(500.0));
627        assert_eq!(merged.total_envelope_height(), Some(300.0));
628
629        // Test serialization/deserialization of merged stats
630        let column_stats = merged.to_column_statistics().unwrap();
631        let deserialized = GeoStatistics::try_from_column_statistics(&column_stats)
632            .unwrap()
633            .unwrap();
634        assert_eq!(deserialized, merged);
635    }
636
637    #[test]
638    fn test_partial_merge() {
639        let stats1 = GeoStatistics::empty()
640            .with_total_geometries(50)
641            .with_total_size_bytes(5000);
642
643        let stats2 = GeoStatistics::empty()
644            .with_puntal_count(20)
645            .with_lineal_count(30);
646
647        let mut merged = stats1.clone();
648        merged.merge(&stats2);
649
650        // Check merged results
651        assert_eq!(merged.total_geometries(), Some(50));
652        assert_eq!(merged.total_size_bytes(), Some(5000));
653        assert_eq!(merged.puntal_count(), Some(20));
654        assert_eq!(merged.lineal_count(), Some(30));
655        assert_eq!(merged.polygonal_count(), Some(0));
656
657        // Test serialization/deserialization of partially merged stats
658        let column_stats = merged.to_column_statistics().unwrap();
659        let deserialized = GeoStatistics::try_from_column_statistics(&column_stats)
660            .unwrap()
661            .unwrap();
662        assert_eq!(deserialized, merged);
663    }
664}