Skip to main content

geonative_processing/
profile.rs

1//! Streaming dataset profiler — one pass over a feature stream, producing
2//! null counts, min/max, distinct counts, top-N values, and a small sample
3//! per field. Plus a computed bbox extent and per-geometry-type histogram.
4//!
5//! ## Design
6//!
7//! - **One pass** — `profile(schema, features, opts)` consumes any iterator,
8//!   so it works with `Layer::read()`, `GeoParquetReader::into_features()`,
9//!   `GeoJsonReader::into_features()`, etc. — anything yielding `Feature`.
10//! - **Bounded memory** — distinct/top-N tracking caps at `opts.distinct_limit`
11//!   per field. Past that, we stop counting individual values and just
12//!   report `distinct_count = None` (meaning "more than `distinct_limit`").
13//! - **First-N sampling** — v0.1 takes the first `opts.sample_n` features
14//!   verbatim. Deterministic, reproducible, but biased toward the head of
15//!   the file. Reservoir sampling is a future improvement.
16//! - **Float-aware** — `Float32` / `Float64` columns get min/max but no
17//!   top-N or distinct count (NaN-safe hashing isn't worth the complication
18//!   for v0.1). Min/max skips NaN (explicit `is_nan` check, then `f64::min` / `f64::max`).
19
20use std::collections::BTreeMap;
21
22use geonative_core::{Feature, GeometryType, Schema, Value, ValueType};
23use serde::Serialize;
24
/// Caller-supplied limits for a profiling pass.
#[derive(Debug, Clone)]
pub struct ProfileOptions {
    /// Requested number of top-frequency values to report per field.
    /// Defaults to 10.
    pub top_n: usize,
    /// Number of features retained as samples; v0.1 keeps the head of the
    /// stream. Defaults to 5.
    pub sample_n: usize,
    /// Ceiling on the distinct values tracked for one field. Once a field
    /// goes past it, its cardinality is reported as unknown / too large.
    /// Defaults to 10_000.
    pub distinct_limit: usize,
}

impl Default for ProfileOptions {
    /// Builds the documented defaults: `top_n = 10`, `sample_n = 5`,
    /// `distinct_limit = 10_000`.
    fn default() -> Self {
        const TOP_N: usize = 10;
        const SAMPLE_N: usize = 5;
        const DISTINCT_LIMIT: usize = 10_000;

        ProfileOptions {
            top_n: TOP_N,
            sample_n: SAMPLE_N,
            distinct_limit: DISTINCT_LIMIT,
        }
    }
}
45
46#[derive(Debug, Serialize)]
47pub struct ProfileReport {
48    pub feature_count: u64,
49    pub geometry: GeometryStats,
50    pub fields: Vec<FieldStats>,
51    /// First `sample_n` features (deterministic head sample for v0.1).
52    pub samples: Vec<SerdeFeature>,
53}
54
55#[derive(Debug, Serialize)]
56pub struct GeometryStats {
57    /// `[xmin, ymin, xmax, ymax]` — present iff at least one geometry had a
58    /// computable bbox.
59    pub computed_extent: Option<[f64; 4]>,
60    /// Histogram of `Geometry` variant → count. Useful for catching mixed
61    /// FeatureCollections where the schema says "Point" but reality says
62    /// "Point + MultiPoint mixed".
63    pub kinds: BTreeMap<String, u64>,
64    /// Count of features whose `geometry` was `None`.
65    pub null_count: u64,
66}
67
68#[derive(Debug, Serialize)]
69pub struct FieldStats {
70    pub name: String,
71    #[serde(rename = "type")]
72    pub ty: String,
73    pub null_count: u64,
74    pub value_count: u64,
75    /// `None` if cardinality exceeded `opts.distinct_limit`, OR the field
76    /// type isn't hashable (floats — see module docs).
77    pub distinct_count: Option<u64>,
78    /// String/JSON representation of min / max — see [`value_to_json_repr`].
79    pub min: Option<JsonValue>,
80    pub max: Option<JsonValue>,
81    /// Top-N most frequent values (descending by count). Empty for
82    /// float-typed or over-cardinality fields.
83    pub top_values: Vec<TopValue>,
84}
85
86#[derive(Debug, Serialize)]
87pub struct TopValue {
88    pub value: JsonValue,
89    pub count: u64,
90}
91
92/// Serializable feature payload (Value → JSON). Geometry is rendered as
93/// GeoJSON-shaped JSON to keep the report self-contained without pulling
94/// in `geonative-geojson` as a dep (which would create a cycle with the
95/// CLI).
96#[derive(Debug, Serialize)]
97pub struct SerdeFeature {
98    pub fid: Option<i64>,
99    pub geometry_kind: Option<String>,
100    pub attributes: BTreeMap<String, JsonValue>,
101}
102
103/// Lightweight JSON-equivalent used in serialized output. We avoid pulling
104/// `serde_json::Value` into the public API so this crate doesn't force a
105/// `serde_json` dependency on downstream library consumers.
106#[derive(Debug, Clone, Serialize)]
107#[serde(untagged)]
108pub enum JsonValue {
109    Null,
110    Bool(bool),
111    Int(i64),
112    Float(f64),
113    String(String),
114}
115
116// ---------------------------------------------------------------------------
117// The pass
118// ---------------------------------------------------------------------------
119
120pub fn profile<I>(schema: &Schema, features: I, opts: ProfileOptions) -> ProfileReport
121where
122    I: IntoIterator<Item = Feature>,
123{
124    let mut field_accs: Vec<FieldAcc> = schema
125        .fields
126        .iter()
127        .map(|f| FieldAcc::new(f.name.clone(), f.ty))
128        .collect();
129
130    let mut geom = GeometryAcc::default();
131    let mut samples: Vec<SerdeFeature> = Vec::with_capacity(opts.sample_n);
132    let mut count: u64 = 0;
133
134    for feat in features {
135        if samples.len() < opts.sample_n {
136            samples.push(serialize_feature(schema, &feat));
137        }
138        geom.observe(&feat);
139        for (i, acc) in field_accs.iter_mut().enumerate() {
140            let v = feat.attributes.get(i).unwrap_or(&Value::Null);
141            acc.observe(v, &opts);
142        }
143        count += 1;
144    }
145
146    ProfileReport {
147        feature_count: count,
148        geometry: geom.finalize(),
149        fields: field_accs.into_iter().map(FieldAcc::finalize).collect(),
150        samples,
151    }
152}
153
154#[derive(Debug, Default)]
155struct GeometryAcc {
156    extent: Option<[f64; 4]>,
157    kinds: BTreeMap<String, u64>,
158    null_count: u64,
159}
160
161impl GeometryAcc {
162    fn observe(&mut self, feat: &Feature) {
163        let Some(g) = &feat.geometry else {
164            self.null_count += 1;
165            return;
166        };
167        let kind = geometry_kind_label(g_type(g));
168        *self.kinds.entry(kind.to_string()).or_insert(0) += 1;
169        if let Some(b) = g.bbox() {
170            self.extent = Some(match self.extent {
171                None => b,
172                Some(prev) => [
173                    prev[0].min(b[0]),
174                    prev[1].min(b[1]),
175                    prev[2].max(b[2]),
176                    prev[3].max(b[3]),
177                ],
178            });
179        }
180    }
181
182    fn finalize(self) -> GeometryStats {
183        GeometryStats {
184            computed_extent: self.extent,
185            kinds: self.kinds,
186            null_count: self.null_count,
187        }
188    }
189}
190
191#[derive(Debug)]
192struct FieldAcc {
193    name: String,
194    ty: ValueType,
195    null_count: u64,
196    value_count: u64,
197    counts: Option<BTreeMap<HashKey, u64>>,
198    cardinality_capped: bool,
199    min_f: Option<f64>,
200    max_f: Option<f64>,
201    min_s: Option<String>,
202    max_s: Option<String>,
203}
204
205impl FieldAcc {
206    fn new(name: String, ty: ValueType) -> Self {
207        Self {
208            name,
209            ty,
210            null_count: 0,
211            value_count: 0,
212            counts: if is_hashable(ty) {
213                Some(BTreeMap::new())
214            } else {
215                None
216            },
217            cardinality_capped: false,
218            min_f: None,
219            max_f: None,
220            min_s: None,
221            max_s: None,
222        }
223    }
224
225    fn observe(&mut self, v: &Value, opts: &ProfileOptions) {
226        if matches!(v, Value::Null) {
227            self.null_count += 1;
228            return;
229        }
230        self.value_count += 1;
231
232        // Min/max — numerics + datetime get f64; strings get lex order.
233        if let Some(n) = as_numeric(v) {
234            if !n.is_nan() {
235                self.min_f = Some(self.min_f.map_or(n, |m| m.min(n)));
236                self.max_f = Some(self.max_f.map_or(n, |m| m.max(n)));
237            }
238        }
239        if let Value::String(s) = v {
240            self.min_s = Some(match self.min_s.take() {
241                None => s.clone(),
242                Some(prev) => {
243                    if s < &prev {
244                        s.clone()
245                    } else {
246                        prev
247                    }
248                }
249            });
250            self.max_s = Some(match self.max_s.take() {
251                None => s.clone(),
252                Some(prev) => {
253                    if s > &prev {
254                        s.clone()
255                    } else {
256                        prev
257                    }
258                }
259            });
260        }
261
262        // Distinct + top-N.
263        if let Some(counts) = self.counts.as_mut() {
264            if let Some(key) = HashKey::from_value(v) {
265                if counts.contains_key(&key) {
266                    *counts.get_mut(&key).unwrap() += 1;
267                } else if counts.len() < opts.distinct_limit {
268                    counts.insert(key, 1);
269                } else {
270                    // Cap reached: stop tracking new keys; existing-key
271                    // increments stop too (we want the report not to lie).
272                    self.cardinality_capped = true;
273                    self.counts = None;
274                }
275            }
276        }
277    }
278
279    fn finalize(self) -> FieldStats {
280        let min = match self.ty {
281            ValueType::String => self.min_s.clone().map(JsonValue::String),
282            _ => self.min_f.and_then(jsonvalue_from_numeric_typed(self.ty)),
283        };
284        let max = match self.ty {
285            ValueType::String => self.max_s.clone().map(JsonValue::String),
286            _ => self.max_f.and_then(jsonvalue_from_numeric_typed(self.ty)),
287        };
288
289        let (distinct_count, top_values) = match (self.counts, self.cardinality_capped) {
290            (Some(counts), false) => {
291                let mut pairs: Vec<_> = counts.into_iter().collect();
292                pairs.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
293                let distinct = pairs.len() as u64;
294                let top = pairs
295                    .into_iter()
296                    .take(DEFAULT_TOP_N_FALLBACK)
297                    .map(|(k, count)| TopValue {
298                        value: k.into_json_value(),
299                        count,
300                    })
301                    .collect();
302                (Some(distinct), top)
303            }
304            _ => (None, Vec::new()),
305        };
306
307        FieldStats {
308            name: self.name,
309            ty: format!("{:?}", self.ty),
310            null_count: self.null_count,
311            value_count: self.value_count,
312            distinct_count,
313            min,
314            max,
315            top_values,
316        }
317    }
318}
319
320/// Top-N is capped here at report-build time; the caller-provided `top_n`
321/// can be lowered by post-filtering. Using a single constant keeps the
322/// hot loop in `observe` from threading the option down.
323const DEFAULT_TOP_N_FALLBACK: usize = 10;
324
325fn jsonvalue_from_numeric_typed(ty: ValueType) -> impl Fn(f64) -> Option<JsonValue> {
326    move |n: f64| match ty {
327        ValueType::Bool => Some(JsonValue::Bool(n != 0.0)),
328        ValueType::Int16 | ValueType::Int32 | ValueType::Int64 => Some(JsonValue::Int(n as i64)),
329        ValueType::Float32 | ValueType::Float64 | ValueType::DateTime => Some(JsonValue::Float(n)),
330        _ => None,
331    }
332}
333
334fn is_hashable(ty: ValueType) -> bool {
335    matches!(
336        ty,
337        ValueType::Bool
338            | ValueType::Int16
339            | ValueType::Int32
340            | ValueType::Int64
341            | ValueType::String
342            | ValueType::DateTime
343            | ValueType::Guid
344    )
345}
346
347fn as_numeric(v: &Value) -> Option<f64> {
348    match v {
349        Value::Bool(b) => Some(if *b { 1.0 } else { 0.0 }),
350        Value::Int16(n) => Some(*n as f64),
351        Value::Int32(n) => Some(*n as f64),
352        Value::Int64(n) => Some(*n as f64),
353        Value::Float32(f) => Some(*f as f64),
354        Value::Float64(f) => Some(*f),
355        Value::DateTime(d) => Some(*d),
356        _ => None,
357    }
358}
359
360// Hashable & orderable key for top-N counting. We avoid `f64` because NaN
361// breaks Eq/Hash; floats just don't get top-N (documented).
362#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
363enum HashKey {
364    Bool(bool),
365    Int(i64),
366    String(String),
367    DateTimeBits(u64),
368    Guid([u8; 16]),
369}
370
371impl HashKey {
372    fn from_value(v: &Value) -> Option<Self> {
373        match v {
374            Value::Bool(b) => Some(Self::Bool(*b)),
375            Value::Int16(n) => Some(Self::Int(*n as i64)),
376            Value::Int32(n) => Some(Self::Int(*n as i64)),
377            Value::Int64(n) => Some(Self::Int(*n)),
378            Value::String(s) => Some(Self::String(s.clone())),
379            Value::DateTime(d) => Some(Self::DateTimeBits(d.to_bits())),
380            Value::Guid(g) => Some(Self::Guid(*g)),
381            _ => None,
382        }
383    }
384
385    fn into_json_value(self) -> JsonValue {
386        match self {
387            Self::Bool(b) => JsonValue::Bool(b),
388            Self::Int(n) => JsonValue::Int(n),
389            Self::String(s) => JsonValue::String(s),
390            Self::DateTimeBits(bits) => JsonValue::Float(f64::from_bits(bits)),
391            Self::Guid(g) => JsonValue::String(hex_lower(&g)),
392        }
393    }
394}
395
/// Encodes `bytes` as lowercase hexadecimal, two characters per byte, with
/// no separators or prefix.
fn hex_lower(bytes: &[u8]) -> String {
    const DIGITS: [char; 16] = [
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
    ];

    bytes
        .iter()
        .flat_map(|&byte| {
            // High nibble first, then low nibble.
            [
                DIGITS[usize::from(byte >> 4)],
                DIGITS[usize::from(byte & 0x0f)],
            ]
        })
        .collect()
}
405
406fn g_type(g: &geonative_core::Geometry) -> GeometryType {
407    use geonative_core::Geometry;
408    match g {
409        Geometry::Point(_) => GeometryType::Point,
410        Geometry::LineString(_) => GeometryType::LineString,
411        Geometry::Polygon(_) => GeometryType::Polygon,
412        Geometry::MultiPoint(_) => GeometryType::MultiPoint,
413        Geometry::MultiLineString(_) => GeometryType::MultiLineString,
414        Geometry::MultiPolygon(_) => GeometryType::MultiPolygon,
415        Geometry::GeometryCollection(_) => GeometryType::GeometryCollection,
416        _ => GeometryType::GeometryCollection,
417    }
418}
419
420fn geometry_kind_label(t: GeometryType) -> &'static str {
421    match t {
422        GeometryType::Point => "Point",
423        GeometryType::LineString => "LineString",
424        GeometryType::Polygon => "Polygon",
425        GeometryType::MultiPoint => "MultiPoint",
426        GeometryType::MultiLineString => "MultiLineString",
427        GeometryType::MultiPolygon => "MultiPolygon",
428        GeometryType::GeometryCollection => "GeometryCollection",
429        _ => "Unknown",
430    }
431}
432
433fn serialize_feature(schema: &Schema, feat: &Feature) -> SerdeFeature {
434    let mut attrs = BTreeMap::new();
435    for (i, field) in schema.fields.iter().enumerate() {
436        let v = feat.attributes.get(i).unwrap_or(&Value::Null);
437        attrs.insert(field.name.clone(), value_to_json_repr(v));
438    }
439    SerdeFeature {
440        fid: feat.fid,
441        geometry_kind: feat
442            .geometry
443            .as_ref()
444            .map(|g| geometry_kind_label(g_type(g)).to_string()),
445        attributes: attrs,
446    }
447}
448
449/// Lossy `Value → JsonValue`. Binary/Guid become hex strings, DateTime
450/// stays numeric (days since 1899-12-30), Xml stays as a string.
451pub fn value_to_json_repr(v: &Value) -> JsonValue {
452    match v {
453        Value::Null => JsonValue::Null,
454        Value::Bool(b) => JsonValue::Bool(*b),
455        Value::Int16(n) => JsonValue::Int(*n as i64),
456        Value::Int32(n) => JsonValue::Int(*n as i64),
457        Value::Int64(n) => JsonValue::Int(*n),
458        Value::Float32(f) => JsonValue::Float(*f as f64),
459        Value::Float64(f) => JsonValue::Float(*f),
460        Value::String(s) | Value::Xml(s) => JsonValue::String(s.clone()),
461        Value::Binary(b) => JsonValue::String(hex_lower(b)),
462        Value::DateTime(d) => JsonValue::Float(*d),
463        Value::Guid(g) => JsonValue::String(hex_lower(g)),
464        _ => JsonValue::Null,
465    }
466}
467
#[cfg(test)]
mod tests {
    use super::*;
    use geonative_core::{Coord, Crs, FieldDef, GeomField, Geometry, GeometryType, Schema};

    /// Schema with `name: String`, `score: Int32`, `weight: Float64` and a
    /// Point geometry column in EPSG:4326.
    fn mk_schema() -> Schema {
        let fields = vec![
            FieldDef::new("name", ValueType::String, true),
            FieldDef::new("score", ValueType::Int32, false),
            FieldDef::new("weight", ValueType::Float64, true),
        ];
        let geom_field = GeomField::new("geometry", GeometryType::Point);
        Schema::new(fields, Some(geom_field), Crs::Epsg(4326))
    }

    /// 2-D point geometry.
    fn pt(x: f64, y: f64) -> Geometry {
        Geometry::Point(Coord::xy(x, y))
    }

    /// Feature matching `mk_schema`; `None` for name / weight means null.
    fn feat(
        fid: i64,
        name: Option<&str>,
        score: i32,
        weight: Option<f64>,
        x: f64,
        y: f64,
    ) -> Feature {
        let attrs = vec![
            name.map_or(Value::Null, |s| Value::String(s.to_string())),
            Value::Int32(score),
            weight.map_or(Value::Null, Value::Float64),
        ];
        Feature::new(Some(fid), Some(pt(x, y)), attrs)
    }

    /// Looks up one field's stats by name, panicking if it is absent.
    fn field<'a>(report: &'a ProfileReport, name: &str) -> &'a FieldStats {
        report
            .fields
            .iter()
            .find(|f| f.name == name)
            .expect("field present in report")
    }

    #[test]
    fn counts_features_and_extent() {
        let report = profile(
            &mk_schema(),
            vec![
                feat(1, Some("a"), 10, Some(1.0), 0.0, 0.0),
                feat(2, Some("b"), 20, Some(2.0), 10.0, 5.0),
                feat(3, Some("a"), 30, None, -3.0, 7.0),
            ],
            ProfileOptions::default(),
        );

        assert_eq!(report.feature_count, 3);
        assert_eq!(
            report.geometry.computed_extent.unwrap(),
            [-3.0, 0.0, 10.0, 7.0]
        );
        assert_eq!(report.geometry.kinds.get("Point"), Some(&3));
        assert_eq!(report.geometry.null_count, 0);
    }

    #[test]
    fn nulls_counted_per_field() {
        let report = profile(
            &mk_schema(),
            vec![
                feat(1, None, 10, None, 0.0, 0.0),
                feat(2, Some("b"), 20, Some(2.0), 1.0, 1.0),
                feat(3, None, 30, Some(3.0), 2.0, 2.0),
            ],
            ProfileOptions::default(),
        );

        let name = field(&report, "name");
        assert_eq!(name.null_count, 2);
        assert_eq!(name.value_count, 1);
        assert_eq!(field(&report, "weight").null_count, 1);
    }

    #[test]
    fn top_values_sorted_by_frequency() {
        let names = ["alice", "bob", "alice", "alice", "bob"];
        let feats: Vec<Feature> = names
            .iter()
            .zip(1_i64..)
            .map(|(name, fid)| feat(fid, Some(name), 1, None, 0.0, 0.0))
            .collect();

        let report = profile(&mk_schema(), feats, ProfileOptions::default());
        let top = &field(&report, "name").top_values;

        assert_eq!(top.len(), 2);
        assert_eq!(top[0].count, 3);
        assert_eq!(top[1].count, 2);
        match &top[0].value {
            JsonValue::String(s) => assert_eq!(s, "alice"),
            other => panic!("expected string, got {other:?}"),
        }
    }

    #[test]
    fn distinct_capped_when_over_limit() {
        let schema = Schema::new(
            vec![FieldDef::new("id", ValueType::Int64, false)],
            None,
            Crs::Unknown,
        );
        let mut feats: Vec<Feature> = Vec::new();
        for n in 0..50_i64 {
            feats.push(Feature::new(Some(n), None, vec![Value::Int64(n)]));
        }
        let opts = ProfileOptions {
            distinct_limit: 10,
            ..Default::default()
        };

        let report = profile(&schema, feats, opts);
        let id = &report.fields[0];
        assert_eq!(id.distinct_count, None);
        assert!(id.top_values.is_empty());
    }

    #[test]
    fn min_max_numeric() {
        let report = profile(
            &mk_schema(),
            vec![
                feat(1, Some("a"), 5, None, 0.0, 0.0),
                feat(2, Some("b"), -3, None, 0.0, 0.0),
                feat(3, Some("c"), 100, None, 0.0, 0.0),
            ],
            ProfileOptions::default(),
        );

        let score = field(&report, "score");
        match (&score.min, &score.max) {
            (Some(JsonValue::Int(mn)), Some(JsonValue::Int(mx))) => {
                assert_eq!(*mn, -3);
                assert_eq!(*mx, 100);
            }
            other => panic!("expected int min/max, got {other:?}"),
        }
    }

    #[test]
    fn samples_are_first_n() {
        let mut feats: Vec<Feature> = Vec::new();
        for i in 0..20 {
            let name = format!("name{i}");
            feats.push(feat(i, Some(&name), i as i32, None, 0.0, 0.0));
        }
        let opts = ProfileOptions {
            sample_n: 3,
            ..Default::default()
        };

        let report = profile(&mk_schema(), feats, opts);
        assert_eq!(report.samples.len(), 3);
        // The head of the stream is kept, so the first sample is fid=0.
        assert_eq!(report.samples[0].fid, Some(0));
    }

    #[test]
    fn null_geometry_counted() {
        let attrs = |score: i32| vec![Value::Null, Value::Int32(score), Value::Null];
        let feats = vec![
            Feature::new(Some(1), None, attrs(1)),
            Feature::new(Some(2), Some(pt(0.0, 0.0)), attrs(2)),
        ];

        let report = profile(&mk_schema(), feats, ProfileOptions::default());
        assert_eq!(report.geometry.null_count, 1);
        assert_eq!(report.geometry.kinds.get("Point"), Some(&1));
    }
}
635}