1use std::collections::BTreeMap;
21
22use geonative_core::{Feature, GeometryType, Schema, Value, ValueType};
23use serde::Serialize;
24
/// Tuning knobs accepted by [`profile`].
#[derive(Debug, Clone)]
pub struct ProfileOptions {
    /// Requested cap on how many of the most frequent values are listed per field.
    pub top_n: usize,
    /// Number of leading features copied verbatim into the report's samples.
    pub sample_n: usize,
    /// Maximum number of distinct values tracked per field; once a field
    /// exceeds it, distinct-count and top-value statistics are dropped for
    /// that field.
    pub distinct_limit: usize,
}

impl Default for ProfileOptions {
    /// Returns the stock settings: 10 top values, 5 samples, 10 000 distinct values.
    fn default() -> Self {
        let top_n = 10;
        let sample_n = 5;
        let distinct_limit = 10_000;
        ProfileOptions {
            top_n,
            sample_n,
            distinct_limit,
        }
    }
}
45
46#[derive(Debug, Serialize)]
47pub struct ProfileReport {
48 pub feature_count: u64,
49 pub geometry: GeometryStats,
50 pub fields: Vec<FieldStats>,
51 pub samples: Vec<SerdeFeature>,
53}
54
55#[derive(Debug, Serialize)]
56pub struct GeometryStats {
57 pub computed_extent: Option<[f64; 4]>,
60 pub kinds: BTreeMap<String, u64>,
64 pub null_count: u64,
66}
67
68#[derive(Debug, Serialize)]
69pub struct FieldStats {
70 pub name: String,
71 #[serde(rename = "type")]
72 pub ty: String,
73 pub null_count: u64,
74 pub value_count: u64,
75 pub distinct_count: Option<u64>,
78 pub min: Option<JsonValue>,
80 pub max: Option<JsonValue>,
81 pub top_values: Vec<TopValue>,
84}
85
86#[derive(Debug, Serialize)]
87pub struct TopValue {
88 pub value: JsonValue,
89 pub count: u64,
90}
91
92#[derive(Debug, Serialize)]
97pub struct SerdeFeature {
98 pub fid: Option<i64>,
99 pub geometry_kind: Option<String>,
100 pub attributes: BTreeMap<String, JsonValue>,
101}
102
103#[derive(Debug, Clone, Serialize)]
107#[serde(untagged)]
108pub enum JsonValue {
109 Null,
110 Bool(bool),
111 Int(i64),
112 Float(f64),
113 String(String),
114}
115
116pub fn profile<I>(schema: &Schema, features: I, opts: ProfileOptions) -> ProfileReport
121where
122 I: IntoIterator<Item = Feature>,
123{
124 let mut field_accs: Vec<FieldAcc> = schema
125 .fields
126 .iter()
127 .map(|f| FieldAcc::new(f.name.clone(), f.ty))
128 .collect();
129
130 let mut geom = GeometryAcc::default();
131 let mut samples: Vec<SerdeFeature> = Vec::with_capacity(opts.sample_n);
132 let mut count: u64 = 0;
133
134 for feat in features {
135 if samples.len() < opts.sample_n {
136 samples.push(serialize_feature(schema, &feat));
137 }
138 geom.observe(&feat);
139 for (i, acc) in field_accs.iter_mut().enumerate() {
140 let v = feat.attributes.get(i).unwrap_or(&Value::Null);
141 acc.observe(v, &opts);
142 }
143 count += 1;
144 }
145
146 ProfileReport {
147 feature_count: count,
148 geometry: geom.finalize(),
149 fields: field_accs.into_iter().map(FieldAcc::finalize).collect(),
150 samples,
151 }
152}
153
/// Running geometry statistics, updated once per feature.
#[derive(Debug, Default)]
struct GeometryAcc {
    /// Union of the bounding boxes seen so far, if any.
    extent: Option<[f64; 4]>,
    /// Occurrences per geometry kind label.
    kinds: BTreeMap<String, u64>,
    /// Features observed without a geometry.
    null_count: u64,
}
160
161impl GeometryAcc {
162 fn observe(&mut self, feat: &Feature) {
163 let Some(g) = &feat.geometry else {
164 self.null_count += 1;
165 return;
166 };
167 let kind = geometry_kind_label(g_type(g));
168 *self.kinds.entry(kind.to_string()).or_insert(0) += 1;
169 if let Some(b) = g.bbox() {
170 self.extent = Some(match self.extent {
171 None => b,
172 Some(prev) => [
173 prev[0].min(b[0]),
174 prev[1].min(b[1]),
175 prev[2].max(b[2]),
176 prev[3].max(b[3]),
177 ],
178 });
179 }
180 }
181
182 fn finalize(self) -> GeometryStats {
183 GeometryStats {
184 computed_extent: self.extent,
185 kinds: self.kinds,
186 null_count: self.null_count,
187 }
188 }
189}
190
191#[derive(Debug)]
192struct FieldAcc {
193 name: String,
194 ty: ValueType,
195 null_count: u64,
196 value_count: u64,
197 counts: Option<BTreeMap<HashKey, u64>>,
198 cardinality_capped: bool,
199 min_f: Option<f64>,
200 max_f: Option<f64>,
201 min_s: Option<String>,
202 max_s: Option<String>,
203}
204
205impl FieldAcc {
206 fn new(name: String, ty: ValueType) -> Self {
207 Self {
208 name,
209 ty,
210 null_count: 0,
211 value_count: 0,
212 counts: if is_hashable(ty) {
213 Some(BTreeMap::new())
214 } else {
215 None
216 },
217 cardinality_capped: false,
218 min_f: None,
219 max_f: None,
220 min_s: None,
221 max_s: None,
222 }
223 }
224
225 fn observe(&mut self, v: &Value, opts: &ProfileOptions) {
226 if matches!(v, Value::Null) {
227 self.null_count += 1;
228 return;
229 }
230 self.value_count += 1;
231
232 if let Some(n) = as_numeric(v) {
234 if !n.is_nan() {
235 self.min_f = Some(self.min_f.map_or(n, |m| m.min(n)));
236 self.max_f = Some(self.max_f.map_or(n, |m| m.max(n)));
237 }
238 }
239 if let Value::String(s) = v {
240 self.min_s = Some(match self.min_s.take() {
241 None => s.clone(),
242 Some(prev) => {
243 if s < &prev {
244 s.clone()
245 } else {
246 prev
247 }
248 }
249 });
250 self.max_s = Some(match self.max_s.take() {
251 None => s.clone(),
252 Some(prev) => {
253 if s > &prev {
254 s.clone()
255 } else {
256 prev
257 }
258 }
259 });
260 }
261
262 if let Some(counts) = self.counts.as_mut() {
264 if let Some(key) = HashKey::from_value(v) {
265 if counts.contains_key(&key) {
266 *counts.get_mut(&key).unwrap() += 1;
267 } else if counts.len() < opts.distinct_limit {
268 counts.insert(key, 1);
269 } else {
270 self.cardinality_capped = true;
273 self.counts = None;
274 }
275 }
276 }
277 }
278
279 fn finalize(self) -> FieldStats {
280 let min = match self.ty {
281 ValueType::String => self.min_s.clone().map(JsonValue::String),
282 _ => self.min_f.and_then(jsonvalue_from_numeric_typed(self.ty)),
283 };
284 let max = match self.ty {
285 ValueType::String => self.max_s.clone().map(JsonValue::String),
286 _ => self.max_f.and_then(jsonvalue_from_numeric_typed(self.ty)),
287 };
288
289 let (distinct_count, top_values) = match (self.counts, self.cardinality_capped) {
290 (Some(counts), false) => {
291 let mut pairs: Vec<_> = counts.into_iter().collect();
292 pairs.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
293 let distinct = pairs.len() as u64;
294 let top = pairs
295 .into_iter()
296 .take(DEFAULT_TOP_N_FALLBACK)
297 .map(|(k, count)| TopValue {
298 value: k.into_json_value(),
299 count,
300 })
301 .collect();
302 (Some(distinct), top)
303 }
304 _ => (None, Vec::new()),
305 };
306
307 FieldStats {
308 name: self.name,
309 ty: format!("{:?}", self.ty),
310 null_count: self.null_count,
311 value_count: self.value_count,
312 distinct_count,
313 min,
314 max,
315 top_values,
316 }
317 }
318}
319
/// Number of top values reported when `FieldAcc::finalize` has no
/// caller-supplied limit to apply. Equals the default `ProfileOptions::top_n`.
const DEFAULT_TOP_N_FALLBACK: usize = 10;
324
325fn jsonvalue_from_numeric_typed(ty: ValueType) -> impl Fn(f64) -> Option<JsonValue> {
326 move |n: f64| match ty {
327 ValueType::Bool => Some(JsonValue::Bool(n != 0.0)),
328 ValueType::Int16 | ValueType::Int32 | ValueType::Int64 => Some(JsonValue::Int(n as i64)),
329 ValueType::Float32 | ValueType::Float64 | ValueType::DateTime => Some(JsonValue::Float(n)),
330 _ => None,
331 }
332}
333
334fn is_hashable(ty: ValueType) -> bool {
335 matches!(
336 ty,
337 ValueType::Bool
338 | ValueType::Int16
339 | ValueType::Int32
340 | ValueType::Int64
341 | ValueType::String
342 | ValueType::DateTime
343 | ValueType::Guid
344 )
345}
346
347fn as_numeric(v: &Value) -> Option<f64> {
348 match v {
349 Value::Bool(b) => Some(if *b { 1.0 } else { 0.0 }),
350 Value::Int16(n) => Some(*n as f64),
351 Value::Int32(n) => Some(*n as f64),
352 Value::Int64(n) => Some(*n as f64),
353 Value::Float32(f) => Some(*f as f64),
354 Value::Float64(f) => Some(*f),
355 Value::DateTime(d) => Some(*d),
356 _ => None,
357 }
358}
359
/// Orderable, hashable stand-in for the value kinds that take part in
/// distinct-value counting.
///
/// The derived ordering (variant order first, then payload) is what breaks
/// ties between equally frequent values.
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
enum HashKey {
    /// Boolean value.
    Bool(bool),
    /// Any integer value, widened to 64 bits.
    Int(i64),
    /// Textual value.
    String(String),
    /// Datetime stored as the raw bit pattern of its `f64`.
    DateTimeBits(u64),
    /// 16-byte GUID.
    Guid([u8; 16]),
}
370
371impl HashKey {
372 fn from_value(v: &Value) -> Option<Self> {
373 match v {
374 Value::Bool(b) => Some(Self::Bool(*b)),
375 Value::Int16(n) => Some(Self::Int(*n as i64)),
376 Value::Int32(n) => Some(Self::Int(*n as i64)),
377 Value::Int64(n) => Some(Self::Int(*n)),
378 Value::String(s) => Some(Self::String(s.clone())),
379 Value::DateTime(d) => Some(Self::DateTimeBits(d.to_bits())),
380 Value::Guid(g) => Some(Self::Guid(*g)),
381 _ => None,
382 }
383 }
384
385 fn into_json_value(self) -> JsonValue {
386 match self {
387 Self::Bool(b) => JsonValue::Bool(b),
388 Self::Int(n) => JsonValue::Int(n),
389 Self::String(s) => JsonValue::String(s),
390 Self::DateTimeBits(bits) => JsonValue::Float(f64::from_bits(bits)),
391 Self::Guid(g) => JsonValue::String(hex_lower(&g)),
392 }
393 }
394}
395
/// Encodes `bytes` as lowercase hexadecimal, two digits per byte.
fn hex_lower(bytes: &[u8]) -> String {
    const DIGITS: [char; 16] = [
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
    ];
    let mut out = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        let high = DIGITS[usize::from(byte >> 4)];
        let low = DIGITS[usize::from(byte & 0x0f)];
        out.extend([high, low]);
    }
    out
}
405
406fn g_type(g: &geonative_core::Geometry) -> GeometryType {
407 use geonative_core::Geometry;
408 match g {
409 Geometry::Point(_) => GeometryType::Point,
410 Geometry::LineString(_) => GeometryType::LineString,
411 Geometry::Polygon(_) => GeometryType::Polygon,
412 Geometry::MultiPoint(_) => GeometryType::MultiPoint,
413 Geometry::MultiLineString(_) => GeometryType::MultiLineString,
414 Geometry::MultiPolygon(_) => GeometryType::MultiPolygon,
415 Geometry::GeometryCollection(_) => GeometryType::GeometryCollection,
416 _ => GeometryType::GeometryCollection,
417 }
418}
419
420fn geometry_kind_label(t: GeometryType) -> &'static str {
421 match t {
422 GeometryType::Point => "Point",
423 GeometryType::LineString => "LineString",
424 GeometryType::Polygon => "Polygon",
425 GeometryType::MultiPoint => "MultiPoint",
426 GeometryType::MultiLineString => "MultiLineString",
427 GeometryType::MultiPolygon => "MultiPolygon",
428 GeometryType::GeometryCollection => "GeometryCollection",
429 _ => "Unknown",
430 }
431}
432
433fn serialize_feature(schema: &Schema, feat: &Feature) -> SerdeFeature {
434 let mut attrs = BTreeMap::new();
435 for (i, field) in schema.fields.iter().enumerate() {
436 let v = feat.attributes.get(i).unwrap_or(&Value::Null);
437 attrs.insert(field.name.clone(), value_to_json_repr(v));
438 }
439 SerdeFeature {
440 fid: feat.fid,
441 geometry_kind: feat
442 .geometry
443 .as_ref()
444 .map(|g| geometry_kind_label(g_type(g)).to_string()),
445 attributes: attrs,
446 }
447}
448
449pub fn value_to_json_repr(v: &Value) -> JsonValue {
452 match v {
453 Value::Null => JsonValue::Null,
454 Value::Bool(b) => JsonValue::Bool(*b),
455 Value::Int16(n) => JsonValue::Int(*n as i64),
456 Value::Int32(n) => JsonValue::Int(*n as i64),
457 Value::Int64(n) => JsonValue::Int(*n),
458 Value::Float32(f) => JsonValue::Float(*f as f64),
459 Value::Float64(f) => JsonValue::Float(*f),
460 Value::String(s) | Value::Xml(s) => JsonValue::String(s.clone()),
461 Value::Binary(b) => JsonValue::String(hex_lower(b)),
462 Value::DateTime(d) => JsonValue::Float(*d),
463 Value::Guid(g) => JsonValue::String(hex_lower(g)),
464 _ => JsonValue::Null,
465 }
466}
467
#[cfg(test)]
mod tests {
    use super::*;
    use geonative_core::{Coord, Crs, FieldDef, GeomField, Geometry, GeometryType, Schema};

    /// Point schema with a nullable string, a required int and a nullable float.
    fn mk_schema() -> Schema {
        let fields = vec![
            FieldDef::new("name", ValueType::String, true),
            FieldDef::new("score", ValueType::Int32, false),
            FieldDef::new("weight", ValueType::Float64, true),
        ];
        let geometry = Some(GeomField::new("geometry", GeometryType::Point));
        Schema::new(fields, geometry, Crs::Epsg(4326))
    }

    /// Shorthand for a 2D point geometry.
    fn pt(x: f64, y: f64) -> Geometry {
        Geometry::Point(Coord::xy(x, y))
    }

    /// Builds a point feature matching `mk_schema`; `None` becomes a null attribute.
    fn feat(
        fid: i64,
        name: Option<&str>,
        score: i32,
        weight: Option<f64>,
        x: f64,
        y: f64,
    ) -> Feature {
        let name_v = match name {
            Some(text) => Value::String(text.to_string()),
            None => Value::Null,
        };
        let weight_v = match weight {
            Some(w) => Value::Float64(w),
            None => Value::Null,
        };
        let attributes = vec![name_v, Value::Int32(score), weight_v];
        Feature::new(Some(fid), Some(pt(x, y)), attributes)
    }

    /// Looks up the statistics of the field called `name`.
    fn field<'a>(report: &'a ProfileReport, name: &str) -> &'a FieldStats {
        report.fields.iter().find(|f| f.name == name).unwrap()
    }

    #[test]
    fn counts_features_and_extent() {
        let feats = vec![
            feat(1, Some("a"), 10, Some(1.0), 0.0, 0.0),
            feat(2, Some("b"), 20, Some(2.0), 10.0, 5.0),
            feat(3, Some("a"), 30, None, -3.0, 7.0),
        ];
        let report = profile(&mk_schema(), feats, ProfileOptions::default());

        assert_eq!(report.feature_count, 3);
        let geometry = &report.geometry;
        assert_eq!(geometry.computed_extent.unwrap(), [-3.0, 0.0, 10.0, 7.0]);
        assert_eq!(geometry.kinds.get("Point"), Some(&3));
        assert_eq!(geometry.null_count, 0);
    }

    #[test]
    fn nulls_counted_per_field() {
        let feats = vec![
            feat(1, None, 10, None, 0.0, 0.0),
            feat(2, Some("b"), 20, Some(2.0), 1.0, 1.0),
            feat(3, None, 30, Some(3.0), 2.0, 2.0),
        ];
        let report = profile(&mk_schema(), feats, ProfileOptions::default());

        let name = field(&report, "name");
        assert_eq!(name.null_count, 2);
        assert_eq!(name.value_count, 1);

        let weight = field(&report, "weight");
        assert_eq!(weight.null_count, 1);
    }

    #[test]
    fn top_values_sorted_by_frequency() {
        let names = ["alice", "bob", "alice", "alice", "bob"];
        let feats: Vec<Feature> = names
            .iter()
            .zip(1_i64..)
            .map(|(name, fid)| feat(fid, Some(name), 1, None, 0.0, 0.0))
            .collect();
        let report = profile(&mk_schema(), feats, ProfileOptions::default());

        let top = &field(&report, "name").top_values;
        assert_eq!(top.len(), 2);
        assert_eq!(top[0].count, 3);
        assert_eq!(top[1].count, 2);
        match &top[0].value {
            JsonValue::String(s) => assert_eq!(s, "alice"),
            other => panic!("expected string, got {other:?}"),
        }
    }

    #[test]
    fn distinct_capped_when_over_limit() {
        let schema = Schema::new(
            vec![FieldDef::new("id", ValueType::Int64, false)],
            None,
            Crs::Unknown,
        );
        let mut feats: Vec<Feature> = Vec::new();
        for n in 0..50_i64 {
            feats.push(Feature::new(Some(n), None, vec![Value::Int64(n)]));
        }
        let opts = ProfileOptions {
            distinct_limit: 10,
            ..Default::default()
        };

        let report = profile(&schema, feats, opts);
        let id = &report.fields[0];
        assert_eq!(id.distinct_count, None);
        assert!(id.top_values.is_empty());
    }

    #[test]
    fn min_max_numeric() {
        let feats = vec![
            feat(1, Some("a"), 5, None, 0.0, 0.0),
            feat(2, Some("b"), -3, None, 0.0, 0.0),
            feat(3, Some("c"), 100, None, 0.0, 0.0),
        ];
        let report = profile(&mk_schema(), feats, ProfileOptions::default());

        let score = field(&report, "score");
        match (&score.min, &score.max) {
            (Some(JsonValue::Int(mn)), Some(JsonValue::Int(mx))) => {
                assert_eq!(*mn, -3);
                assert_eq!(*mx, 100);
            }
            other => panic!("expected int min/max, got {other:?}"),
        }
    }

    #[test]
    fn samples_are_first_n() {
        let mut feats: Vec<Feature> = Vec::new();
        for i in 0..20 {
            feats.push(feat(i, Some(&format!("name{i}")), i as i32, None, 0.0, 0.0));
        }
        let opts = ProfileOptions {
            sample_n: 3,
            ..Default::default()
        };

        let report = profile(&mk_schema(), feats, opts);
        assert_eq!(report.samples.len(), 3);
        assert_eq!(report.samples[0].fid, Some(0));
    }

    #[test]
    fn null_geometry_counted() {
        let without_geometry = Feature::new(
            Some(1),
            None,
            vec![Value::Null, Value::Int32(1), Value::Null],
        );
        let with_geometry = Feature::new(
            Some(2),
            Some(pt(0.0, 0.0)),
            vec![Value::Null, Value::Int32(2), Value::Null],
        );

        let report = profile(
            &mk_schema(),
            vec![without_geometry, with_geometry],
            ProfileOptions::default(),
        );
        assert_eq!(report.geometry.null_count, 1);
        assert_eq!(report.geometry.kinds.get("Point"), Some(&1));
    }
}