Skip to main content

geonative_processing/
profile.rs

1//! Streaming dataset profiler — one pass over a feature stream, producing
2//! null counts, min/max, distinct counts, top-N values, and a small sample
3//! per field. Plus a computed bbox extent and per-geometry-type histogram.
4//!
5//! ## Design
6//!
7//! - **One pass** — `profile(schema, features, opts)` consumes any iterator,
8//!   so it works with `Layer::read()`, `GeoParquetReader::into_features()`,
9//!   `GeoJsonReader::into_features()`, etc. — anything yielding `Feature`.
10//! - **Bounded memory** — distinct/top-N tracking caps at `opts.distinct_limit`
11//!   per field. Past that, we stop counting individual values and just
12//!   report `distinct_count = None` (meaning "more than `distinct_limit`").
13//! - **First-N sampling** — v0.1 takes the first `opts.sample_n` features
14//!   verbatim. Deterministic, reproducible, but biased toward the head of
15//!   the file. Reservoir sampling is a future improvement.
//! - **Float-aware** — `Float32` / `Float64` columns get min/max but no
//!   top-N or distinct count (NaN-safe hashing isn't worth the complication
//!   for v0.1). Min/max skips NaN values and folds the rest with
//!   `f64::min` / `f64::max`.
19
20use std::collections::BTreeMap;
21
22use geonative_core::{Feature, GeometryType, Schema, Value, ValueType};
23use serde::Serialize;
24
/// Tuning knobs for [`profile`].
#[derive(Debug, Clone)]
pub struct ProfileOptions {
    /// Number of most-frequent values to report per field (default `10`).
    /// The report builder never emits more than its built-in cap of 10
    /// (`DEFAULT_TOP_N_FALLBACK`), whatever is requested here.
    pub top_n: usize,
    /// Number of leading features kept verbatim as samples (default `5`).
    pub sample_n: usize,
    /// Distinct values tracked per field before the profiler gives up and
    /// reports the cardinality as unknown / too large (default `10_000`).
    pub distinct_limit: usize,
}

impl Default for ProfileOptions {
    /// Returns the documented defaults: 10 top values, 5 samples and a
    /// distinct-value cap of 10 000 per field.
    fn default() -> Self {
        let (top_n, sample_n, distinct_limit) = (10, 5, 10_000);
        Self {
            top_n,
            sample_n,
            distinct_limit,
        }
    }
}
45
46#[derive(Debug, Serialize)]
47pub struct ProfileReport {
48    pub feature_count: u64,
49    pub geometry: GeometryStats,
50    pub fields: Vec<FieldStats>,
51    /// First `sample_n` features (deterministic head sample for v0.1).
52    pub samples: Vec<SerdeFeature>,
53}
54
55#[derive(Debug, Serialize)]
56pub struct GeometryStats {
57    /// `[xmin, ymin, xmax, ymax]` — present iff at least one geometry had a
58    /// computable bbox.
59    pub computed_extent: Option<[f64; 4]>,
60    /// Histogram of `Geometry` variant → count. Useful for catching mixed
61    /// FeatureCollections where the schema says "Point" but reality says
62    /// "Point + MultiPoint mixed".
63    pub kinds: BTreeMap<String, u64>,
64    /// Count of features whose `geometry` was `None`.
65    pub null_count: u64,
66}
67
68#[derive(Debug, Serialize)]
69pub struct FieldStats {
70    pub name: String,
71    #[serde(rename = "type")]
72    pub ty: String,
73    pub null_count: u64,
74    pub value_count: u64,
75    /// `None` if cardinality exceeded `opts.distinct_limit`, OR the field
76    /// type isn't hashable (floats — see module docs).
77    pub distinct_count: Option<u64>,
78    /// String/JSON representation of min / max — see [`value_to_json_repr`].
79    pub min: Option<JsonValue>,
80    pub max: Option<JsonValue>,
81    /// Top-N most frequent values (descending by count). Empty for
82    /// float-typed or over-cardinality fields.
83    pub top_values: Vec<TopValue>,
84}
85
86#[derive(Debug, Serialize)]
87pub struct TopValue {
88    pub value: JsonValue,
89    pub count: u64,
90}
91
92/// Serializable feature payload (Value → JSON). Geometry is rendered as
93/// GeoJSON-shaped JSON to keep the report self-contained without pulling
94/// in `geonative-geojson` as a dep (which would create a cycle with the
95/// CLI).
96#[derive(Debug, Serialize)]
97pub struct SerdeFeature {
98    pub fid: Option<i64>,
99    pub geometry_kind: Option<String>,
100    pub attributes: BTreeMap<String, JsonValue>,
101}
102
103/// Lightweight JSON-equivalent used in serialized output. We avoid pulling
104/// `serde_json::Value` into the public API so this crate doesn't force a
105/// `serde_json` dependency on downstream library consumers.
106#[derive(Debug, Clone, Serialize)]
107#[serde(untagged)]
108pub enum JsonValue {
109    Null,
110    Bool(bool),
111    Int(i64),
112    Float(f64),
113    String(String),
114}
115
116// ---------------------------------------------------------------------------
117// The pass
118// ---------------------------------------------------------------------------
119
120pub fn profile<I>(schema: &Schema, features: I, opts: ProfileOptions) -> ProfileReport
121where
122    I: IntoIterator<Item = Feature>,
123{
124    let mut field_accs: Vec<FieldAcc> = schema
125        .fields
126        .iter()
127        .map(|f| FieldAcc::new(f.name.clone(), f.ty))
128        .collect();
129
130    let mut geom = GeometryAcc::default();
131    let mut samples: Vec<SerdeFeature> = Vec::with_capacity(opts.sample_n);
132    let mut count: u64 = 0;
133
134    for feat in features {
135        if samples.len() < opts.sample_n {
136            samples.push(serialize_feature(schema, &feat));
137        }
138        geom.observe(&feat);
139        for (i, acc) in field_accs.iter_mut().enumerate() {
140            let v = feat.attributes.get(i).unwrap_or(&Value::Null);
141            acc.observe(v, &opts);
142        }
143        count += 1;
144    }
145
146    ProfileReport {
147        feature_count: count,
148        geometry: geom.finalize(),
149        fields: field_accs.into_iter().map(FieldAcc::finalize).collect(),
150        samples,
151    }
152}
153
/// Running geometry state for the profiling pass; starts out empty via
/// `Default`.
#[derive(Debug, Default)]
struct GeometryAcc {
    /// Union of the bounding boxes seen so far, `[xmin, ymin, xmax, ymax]`.
    extent: Option<[f64; 4]>,
    /// Geometry kind label → number of features seen with that kind.
    kinds: BTreeMap<String, u64>,
    /// Number of features seen without a geometry.
    null_count: u64,
}
160
161impl GeometryAcc {
162    fn observe(&mut self, feat: &Feature) {
163        let Some(g) = &feat.geometry else {
164            self.null_count += 1;
165            return;
166        };
167        let kind = geometry_kind_label(g_type(g));
168        *self.kinds.entry(kind.to_string()).or_insert(0) += 1;
169        if let Some(b) = g.bbox() {
170            self.extent = Some(match self.extent {
171                None => b,
172                Some(prev) => [
173                    prev[0].min(b[0]),
174                    prev[1].min(b[1]),
175                    prev[2].max(b[2]),
176                    prev[3].max(b[3]),
177                ],
178            });
179        }
180    }
181
182    fn finalize(self) -> GeometryStats {
183        GeometryStats {
184            computed_extent: self.extent,
185            kinds: self.kinds,
186            null_count: self.null_count,
187        }
188    }
189}
190
191#[derive(Debug)]
192struct FieldAcc {
193    name: String,
194    ty: ValueType,
195    null_count: u64,
196    value_count: u64,
197    counts: Option<BTreeMap<HashKey, u64>>,
198    cardinality_capped: bool,
199    min_f: Option<f64>,
200    max_f: Option<f64>,
201    min_s: Option<String>,
202    max_s: Option<String>,
203}
204
205impl FieldAcc {
206    fn new(name: String, ty: ValueType) -> Self {
207        Self {
208            name,
209            ty,
210            null_count: 0,
211            value_count: 0,
212            counts: if is_hashable(ty) {
213                Some(BTreeMap::new())
214            } else {
215                None
216            },
217            cardinality_capped: false,
218            min_f: None,
219            max_f: None,
220            min_s: None,
221            max_s: None,
222        }
223    }
224
225    fn observe(&mut self, v: &Value, opts: &ProfileOptions) {
226        if matches!(v, Value::Null) {
227            self.null_count += 1;
228            return;
229        }
230        self.value_count += 1;
231
232        // Min/max — numerics + datetime get f64; strings get lex order.
233        if let Some(n) = as_numeric(v) {
234            if !n.is_nan() {
235                self.min_f = Some(self.min_f.map_or(n, |m| m.min(n)));
236                self.max_f = Some(self.max_f.map_or(n, |m| m.max(n)));
237            }
238        }
239        if let Value::String(s) = v {
240            self.min_s = Some(match self.min_s.take() {
241                None => s.clone(),
242                Some(prev) => {
243                    if s < &prev {
244                        s.clone()
245                    } else {
246                        prev
247                    }
248                }
249            });
250            self.max_s = Some(match self.max_s.take() {
251                None => s.clone(),
252                Some(prev) => {
253                    if s > &prev {
254                        s.clone()
255                    } else {
256                        prev
257                    }
258                }
259            });
260        }
261
262        // Distinct + top-N.
263        if let Some(counts) = self.counts.as_mut() {
264            if let Some(key) = HashKey::from_value(v) {
265                if counts.contains_key(&key) {
266                    *counts.get_mut(&key).unwrap() += 1;
267                } else if counts.len() < opts.distinct_limit {
268                    counts.insert(key, 1);
269                } else {
270                    // Cap reached: stop tracking new keys; existing-key
271                    // increments stop too (we want the report not to lie).
272                    self.cardinality_capped = true;
273                    self.counts = None;
274                }
275            }
276        }
277    }
278
279    fn finalize(self) -> FieldStats {
280        let min = match self.ty {
281            ValueType::String => self.min_s.clone().map(JsonValue::String),
282            _ => self.min_f.and_then(jsonvalue_from_numeric_typed(self.ty)),
283        };
284        let max = match self.ty {
285            ValueType::String => self.max_s.clone().map(JsonValue::String),
286            _ => self.max_f.and_then(jsonvalue_from_numeric_typed(self.ty)),
287        };
288
289        let (distinct_count, top_values) = match (self.counts, self.cardinality_capped) {
290            (Some(counts), false) => {
291                let mut pairs: Vec<_> = counts.into_iter().collect();
292                pairs.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
293                let distinct = pairs.len() as u64;
294                let top = pairs
295                    .into_iter()
296                    .take(DEFAULT_TOP_N_FALLBACK)
297                    .map(|(k, count)| TopValue {
298                        value: k.into_json_value(),
299                        count,
300                    })
301                    .collect();
302                (Some(distinct), top)
303            }
304            _ => (None, Vec::new()),
305        };
306
307        FieldStats {
308            name: self.name,
309            ty: format!("{:?}", self.ty),
310            null_count: self.null_count,
311            value_count: self.value_count,
312            distinct_count,
313            min,
314            max,
315            top_values,
316        }
317    }
318}
319
/// Hard ceiling on the number of top values `FieldAcc::finalize` emits per
/// field. The cap is applied when the report is built; a smaller
/// caller-provided `top_n` can be reached by post-filtering. A single
/// constant means the hot loop in `observe` does not need the option
/// threaded down to it.
const DEFAULT_TOP_N_FALLBACK: usize = 10;
324
325fn jsonvalue_from_numeric_typed(ty: ValueType) -> impl Fn(f64) -> Option<JsonValue> {
326    move |n: f64| match ty {
327        ValueType::Bool => Some(JsonValue::Bool(n != 0.0)),
328        ValueType::Int16 | ValueType::Int32 | ValueType::Int64 => Some(JsonValue::Int(n as i64)),
329        ValueType::Float32 | ValueType::Float64 | ValueType::DateTime => Some(JsonValue::Float(n)),
330        _ => None,
331    }
332}
333
334fn is_hashable(ty: ValueType) -> bool {
335    matches!(
336        ty,
337        ValueType::Bool
338            | ValueType::Int16
339            | ValueType::Int32
340            | ValueType::Int64
341            | ValueType::String
342            | ValueType::DateTime
343            | ValueType::Guid
344    )
345}
346
347fn as_numeric(v: &Value) -> Option<f64> {
348    match v {
349        Value::Bool(b) => Some(if *b { 1.0 } else { 0.0 }),
350        Value::Int16(n) => Some(*n as f64),
351        Value::Int32(n) => Some(*n as f64),
352        Value::Int64(n) => Some(*n as f64),
353        Value::Float32(f) => Some(*f as f64),
354        Value::Float64(f) => Some(*f),
355        Value::DateTime(d) => Some(*d),
356        _ => None,
357    }
358}
359
// Key type for distinct / top-N counting: hashable and totally ordered.
// `f64` is kept out because NaN breaks Eq/Hash, so float columns simply get
// no top-N (documented). Datetimes are stored as their raw bit pattern.
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
enum HashKey {
    Bool(bool),
    // All integer widths are widened to i64.
    Int(i64),
    String(String),
    // Bit pattern of the datetime's f64 value.
    DateTimeBits(u64),
    Guid([u8; 16]),
}
370
371impl HashKey {
372    fn from_value(v: &Value) -> Option<Self> {
373        match v {
374            Value::Bool(b) => Some(Self::Bool(*b)),
375            Value::Int16(n) => Some(Self::Int(*n as i64)),
376            Value::Int32(n) => Some(Self::Int(*n as i64)),
377            Value::Int64(n) => Some(Self::Int(*n)),
378            Value::String(s) => Some(Self::String(s.clone())),
379            Value::DateTime(d) => Some(Self::DateTimeBits(d.to_bits())),
380            Value::Guid(g) => Some(Self::Guid(*g)),
381            _ => None,
382        }
383    }
384
385    fn into_json_value(self) -> JsonValue {
386        match self {
387            Self::Bool(b) => JsonValue::Bool(b),
388            Self::Int(n) => JsonValue::Int(n),
389            Self::String(s) => JsonValue::String(s),
390            Self::DateTimeBits(bits) => JsonValue::Float(f64::from_bits(bits)),
391            Self::Guid(g) => JsonValue::String(hex_lower(&g)),
392        }
393    }
394}
395
/// Renders `bytes` as lowercase hexadecimal, two digits per byte and no
/// separators. An empty slice gives an empty string.
fn hex_lower(bytes: &[u8]) -> String {
    let mut out = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        // High nibble first, then low nibble.
        for nibble in [byte >> 4, byte & 0x0f] {
            let digit = char::from_digit(u32::from(nibble), 16)
                .expect("a nibble is always below 16");
            out.push(digit);
        }
    }
    out
}
405
406fn g_type(g: &geonative_core::Geometry) -> GeometryType {
407    use geonative_core::Geometry;
408    match g {
409        Geometry::Point(_) => GeometryType::Point,
410        Geometry::LineString(_) => GeometryType::LineString,
411        Geometry::Polygon(_) => GeometryType::Polygon,
412        Geometry::MultiPoint(_) => GeometryType::MultiPoint,
413        Geometry::MultiLineString(_) => GeometryType::MultiLineString,
414        Geometry::MultiPolygon(_) => GeometryType::MultiPolygon,
415        Geometry::GeometryCollection(_) => GeometryType::GeometryCollection,
416        _ => GeometryType::GeometryCollection,
417    }
418}
419
420fn geometry_kind_label(t: GeometryType) -> &'static str {
421    match t {
422        GeometryType::Point => "Point",
423        GeometryType::LineString => "LineString",
424        GeometryType::Polygon => "Polygon",
425        GeometryType::MultiPoint => "MultiPoint",
426        GeometryType::MultiLineString => "MultiLineString",
427        GeometryType::MultiPolygon => "MultiPolygon",
428        GeometryType::GeometryCollection => "GeometryCollection",
429        _ => "Unknown",
430    }
431}
432
433fn serialize_feature(schema: &Schema, feat: &Feature) -> SerdeFeature {
434    let mut attrs = BTreeMap::new();
435    for (i, field) in schema.fields.iter().enumerate() {
436        let v = feat.attributes.get(i).unwrap_or(&Value::Null);
437        attrs.insert(field.name.clone(), value_to_json_repr(v));
438    }
439    SerdeFeature {
440        fid: feat.fid,
441        geometry_kind: feat
442            .geometry
443            .as_ref()
444            .map(|g| geometry_kind_label(g_type(g)).to_string()),
445        attributes: attrs,
446    }
447}
448
449/// Lossy `Value → JsonValue`. Binary/Guid become hex strings, DateTime
450/// stays numeric (days since 1899-12-30), Xml stays as a string.
451pub fn value_to_json_repr(v: &Value) -> JsonValue {
452    match v {
453        Value::Null => JsonValue::Null,
454        Value::Bool(b) => JsonValue::Bool(*b),
455        Value::Int16(n) => JsonValue::Int(*n as i64),
456        Value::Int32(n) => JsonValue::Int(*n as i64),
457        Value::Int64(n) => JsonValue::Int(*n),
458        Value::Float32(f) => JsonValue::Float(*f as f64),
459        Value::Float64(f) => JsonValue::Float(*f),
460        Value::String(s) | Value::Xml(s) => JsonValue::String(s.clone()),
461        Value::Binary(b) => JsonValue::String(hex_lower(b)),
462        Value::DateTime(d) => JsonValue::Float(*d),
463        Value::Guid(g) => JsonValue::String(hex_lower(g)),
464        _ => JsonValue::Null,
465    }
466}
467
#[cfg(test)]
mod tests {
    use super::*;
    use geonative_core::{Coord, Crs, FieldDef, GeomField, Geometry, GeometryType, Schema};

    /// Point schema shared by most tests: `name` (string), `score` (int32),
    /// `weight` (float64).
    fn mk_schema() -> Schema {
        let fields = vec![
            FieldDef::new("name", ValueType::String, true),
            FieldDef::new("score", ValueType::Int32, false),
            FieldDef::new("weight", ValueType::Float64, true),
        ];
        Schema::new(
            fields,
            Some(GeomField::new("geometry", GeometryType::Point)),
            Crs::Epsg(4326),
        )
    }

    fn pt(x: f64, y: f64) -> Geometry {
        Geometry::Point(Coord::xy(x, y))
    }

    /// Builds a point feature matching `mk_schema`; `None` becomes a null
    /// attribute.
    fn feat(fid: i64, name: Option<&str>, score: i32, weight: Option<f64>, x: f64, y: f64) -> Feature {
        let name_v = name.map_or(Value::Null, |s| Value::String(s.to_string()));
        let weight_v = weight.map_or(Value::Null, Value::Float64);
        let attributes = vec![name_v, Value::Int32(score), weight_v];
        Feature::new(Some(fid), Some(pt(x, y)), attributes)
    }

    /// Finds a field's stats by name; panics if the report lacks it.
    fn field<'a>(report: &'a ProfileReport, name: &str) -> &'a FieldStats {
        report.fields.iter().find(|f| f.name == name).unwrap()
    }

    #[test]
    fn counts_features_and_extent() {
        let feats = vec![
            feat(1, Some("a"), 10, Some(1.0), 0.0, 0.0),
            feat(2, Some("b"), 20, Some(2.0), 10.0, 5.0),
            feat(3, Some("a"), 30, None, -3.0, 7.0),
        ];
        let report = profile(&mk_schema(), feats, ProfileOptions::default());

        assert_eq!(report.feature_count, 3);
        assert_eq!(
            report.geometry.computed_extent.unwrap(),
            [-3.0, 0.0, 10.0, 7.0]
        );
        assert_eq!(report.geometry.kinds.get("Point"), Some(&3));
        assert_eq!(report.geometry.null_count, 0);
    }

    #[test]
    fn nulls_counted_per_field() {
        let feats = vec![
            feat(1, None, 10, None, 0.0, 0.0),
            feat(2, Some("b"), 20, Some(2.0), 1.0, 1.0),
            feat(3, None, 30, Some(3.0), 2.0, 2.0),
        ];
        let report = profile(&mk_schema(), feats, ProfileOptions::default());

        let name = field(&report, "name");
        assert_eq!(name.null_count, 2);
        assert_eq!(name.value_count, 1);
        assert_eq!(field(&report, "weight").null_count, 1);
    }

    #[test]
    fn top_values_sorted_by_frequency() {
        let feats: Vec<Feature> = ["alice", "bob", "alice", "alice", "bob"]
            .into_iter()
            .zip(1_i64..)
            .map(|(who, fid)| feat(fid, Some(who), 1, None, 0.0, 0.0))
            .collect();
        let report = profile(&mk_schema(), feats, ProfileOptions::default());

        let top = &field(&report, "name").top_values;
        assert_eq!(top.len(), 2);
        assert_eq!(top[0].count, 3);
        assert_eq!(top[1].count, 2);
        match &top[0].value {
            JsonValue::String(s) => assert_eq!(s, "alice"),
            other => panic!("expected string, got {other:?}"),
        }
    }

    #[test]
    fn distinct_capped_when_over_limit() {
        let schema = Schema::new(
            vec![FieldDef::new("id", ValueType::Int64, false)],
            None,
            Crs::Unknown,
        );
        // 50 distinct ids against a cap of 10.
        let feats: Vec<Feature> = (0..50_i64)
            .map(|n| Feature::new(Some(n), None, vec![Value::Int64(n)]))
            .collect();
        let opts = ProfileOptions {
            distinct_limit: 10,
            ..Default::default()
        };
        let report = profile(&schema, feats, opts);

        let id = &report.fields[0];
        assert_eq!(id.distinct_count, None);
        assert!(id.top_values.is_empty());
    }

    #[test]
    fn min_max_numeric() {
        let feats = vec![
            feat(1, Some("a"), 5, None, 0.0, 0.0),
            feat(2, Some("b"), -3, None, 0.0, 0.0),
            feat(3, Some("c"), 100, None, 0.0, 0.0),
        ];
        let report = profile(&mk_schema(), feats, ProfileOptions::default());

        let score = field(&report, "score");
        let (Some(JsonValue::Int(mn)), Some(JsonValue::Int(mx))) = (&score.min, &score.max) else {
            panic!(
                "expected int min/max, got {:?}",
                (&score.min, &score.max)
            );
        };
        assert_eq!(*mn, -3);
        assert_eq!(*mx, 100);
    }

    #[test]
    fn samples_are_first_n() {
        let feats: Vec<Feature> = (0..20)
            .map(|i| feat(i, Some(&format!("name{i}")), i as i32, None, 0.0, 0.0))
            .collect();
        let opts = ProfileOptions {
            sample_n: 3,
            ..Default::default()
        };
        let report = profile(&mk_schema(), feats, opts);

        assert_eq!(report.samples.len(), 3);
        // The head sample starts at the very first feature (fid = 0).
        assert_eq!(report.samples[0].fid, Some(0));
    }

    #[test]
    fn null_geometry_counted() {
        let attrs = |score: i32| vec![Value::Null, Value::Int32(score), Value::Null];
        let feats = vec![
            Feature::new(Some(1), None, attrs(1)),
            Feature::new(Some(2), Some(pt(0.0, 0.0)), attrs(2)),
        ];
        let report = profile(&mk_schema(), feats, ProfileOptions::default());

        assert_eq!(report.geometry.null_count, 1);
        assert_eq!(report.geometry.kinds.get("Point"), Some(&1));
    }
}