1use std::collections::BTreeMap;
21
22use geonative_core::{Feature, GeometryType, Schema, Value, ValueType};
23use serde::Serialize;
24
/// Tunable limits for a profiling run.
#[derive(Debug, Clone)]
pub struct ProfileOptions {
    /// Requested number of most-frequent values to keep per field.
    /// NOTE(review): confirm the field accumulator actually reads this value;
    /// `DEFAULT_TOP_N_FALLBACK` carries the same default of 10.
    pub top_n: usize,
    /// Number of leading features copied into the report's `samples`.
    pub sample_n: usize,
    /// Maximum number of distinct values tracked per field before distinct
    /// counting is abandoned for that field.
    pub distinct_limit: usize,
}

impl Default for ProfileOptions {
    /// Stock limits: 10 top values, 5 samples, 10 000 distinct values.
    fn default() -> Self {
        let (top_n, sample_n, distinct_limit) = (10, 5, 10_000);
        Self {
            top_n,
            sample_n,
            distinct_limit,
        }
    }
}
45
/// Complete result of one `profile` run; serializable with serde.
#[derive(Debug, Serialize)]
pub struct ProfileReport {
    /// Total number of features consumed from the input iterator.
    pub feature_count: u64,
    /// Aggregated geometry statistics (extent, kind histogram, null count).
    pub geometry: GeometryStats,
    /// One entry per schema field, in schema order.
    pub fields: Vec<FieldStats>,
    /// The first `sample_n` features of the input, in input order.
    pub samples: Vec<SerdeFeature>,
}
54
/// Geometry-level statistics gathered across all features.
#[derive(Debug, Serialize)]
pub struct GeometryStats {
    /// Union of every geometry's bounding box: slots 0 and 1 are merged with
    /// `min`, slots 2 and 3 with `max` (the tests in this file expect
    /// `[min_x, min_y, max_x, max_y]`). `None` when no geometry produced a bbox.
    pub computed_extent: Option<[f64; 4]>,
    /// Number of non-null geometries per kind label (e.g. `"Point"`).
    pub kinds: BTreeMap<String, u64>,
    /// Number of features that carried no geometry.
    pub null_count: u64,
}
67
/// Statistics for a single attribute field.
#[derive(Debug, Serialize)]
pub struct FieldStats {
    /// Field name as declared in the schema.
    pub name: String,
    /// `Debug` rendering of the declared value type; serialized under the key `type`.
    #[serde(rename = "type")]
    pub ty: String,
    /// Number of features whose value for this field was null (or missing).
    pub null_count: u64,
    /// Number of features with a non-null value for this field.
    pub value_count: u64,
    /// Number of distinct values seen; `None` when the field type is not
    /// tracked for distinct values or the distinct limit was exceeded.
    pub distinct_count: Option<u64>,
    /// Smallest observed value: lexicographic for string-typed fields, numeric
    /// otherwise. `None` when nothing comparable was observed.
    pub min: Option<JsonValue>,
    /// Largest observed value, with the same rules as `min`.
    pub max: Option<JsonValue>,
    /// Most frequent values, highest count first; empty whenever
    /// `distinct_count` is `None`.
    pub top_values: Vec<TopValue>,
}
85
/// One entry of a field's frequency ranking.
#[derive(Debug, Serialize)]
pub struct TopValue {
    /// The observed value, in its JSON-friendly form.
    pub value: JsonValue,
    /// How many features carried this value.
    pub count: u64,
}
91
/// Serializable snapshot of a sampled feature.
#[derive(Debug, Serialize)]
pub struct SerdeFeature {
    /// Feature id copied from the source feature, if it had one.
    pub fid: Option<i64>,
    /// Kind label of the feature's geometry; `None` when it has no geometry.
    pub geometry_kind: Option<String>,
    /// Attribute values keyed by schema field name.
    pub attributes: BTreeMap<String, JsonValue>,
}
102
/// Minimal JSON scalar used for reported values.
///
/// `#[serde(untagged)]` makes each variant serialize as its bare payload
/// rather than as a tagged enum.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
pub enum JsonValue {
    /// Absent value.
    Null,
    /// Boolean value.
    Bool(bool),
    /// Integer value; 16- and 32-bit integers are widened to `i64`.
    Int(i64),
    /// Floating-point value; also used for date-time values.
    Float(f64),
    /// Text value; also used for hex-encoded binary and GUID payloads.
    String(String),
}
115
116pub fn profile<I>(schema: &Schema, features: I, opts: ProfileOptions) -> ProfileReport
121where
122 I: IntoIterator<Item = Feature>,
123{
124 let mut field_accs: Vec<FieldAcc> = schema
125 .fields
126 .iter()
127 .map(|f| FieldAcc::new(f.name.clone(), f.ty))
128 .collect();
129
130 let mut geom = GeometryAcc::default();
131 let mut samples: Vec<SerdeFeature> = Vec::with_capacity(opts.sample_n);
132 let mut count: u64 = 0;
133
134 for feat in features {
135 if samples.len() < opts.sample_n {
136 samples.push(serialize_feature(schema, &feat));
137 }
138 geom.observe(&feat);
139 for (i, acc) in field_accs.iter_mut().enumerate() {
140 let v = feat.attributes.get(i).unwrap_or(&Value::Null);
141 acc.observe(v, &opts);
142 }
143 count += 1;
144 }
145
146 ProfileReport {
147 feature_count: count,
148 geometry: geom.finalize(),
149 fields: field_accs.into_iter().map(FieldAcc::finalize).collect(),
150 samples,
151 }
152}
153
/// Running geometry statistics, updated once per feature.
#[derive(Debug, Default)]
struct GeometryAcc {
    /// Union of all bounding boxes seen so far; `None` until the first bbox.
    extent: Option<[f64; 4]>,
    /// Count of non-null geometries per kind label.
    kinds: BTreeMap<String, u64>,
    /// Count of features without a geometry.
    null_count: u64,
}
160
161impl GeometryAcc {
162 fn observe(&mut self, feat: &Feature) {
163 let Some(g) = &feat.geometry else {
164 self.null_count += 1;
165 return;
166 };
167 let kind = geometry_kind_label(g_type(g));
168 *self.kinds.entry(kind.to_string()).or_insert(0) += 1;
169 if let Some(b) = g.bbox() {
170 self.extent = Some(match self.extent {
171 None => b,
172 Some(prev) => [
173 prev[0].min(b[0]),
174 prev[1].min(b[1]),
175 prev[2].max(b[2]),
176 prev[3].max(b[3]),
177 ],
178 });
179 }
180 }
181
182 fn finalize(self) -> GeometryStats {
183 GeometryStats {
184 computed_extent: self.extent,
185 kinds: self.kinds,
186 null_count: self.null_count,
187 }
188 }
189}
190
191#[derive(Debug)]
192struct FieldAcc {
193 name: String,
194 ty: ValueType,
195 null_count: u64,
196 value_count: u64,
197 counts: Option<BTreeMap<HashKey, u64>>,
198 cardinality_capped: bool,
199 min_f: Option<f64>,
200 max_f: Option<f64>,
201 min_s: Option<String>,
202 max_s: Option<String>,
203}
204
205impl FieldAcc {
206 fn new(name: String, ty: ValueType) -> Self {
207 Self {
208 name,
209 ty,
210 null_count: 0,
211 value_count: 0,
212 counts: if is_hashable(ty) {
213 Some(BTreeMap::new())
214 } else {
215 None
216 },
217 cardinality_capped: false,
218 min_f: None,
219 max_f: None,
220 min_s: None,
221 max_s: None,
222 }
223 }
224
225 fn observe(&mut self, v: &Value, opts: &ProfileOptions) {
226 if matches!(v, Value::Null) {
227 self.null_count += 1;
228 return;
229 }
230 self.value_count += 1;
231
232 if let Some(n) = as_numeric(v) {
234 if !n.is_nan() {
235 self.min_f = Some(self.min_f.map_or(n, |m| m.min(n)));
236 self.max_f = Some(self.max_f.map_or(n, |m| m.max(n)));
237 }
238 }
239 if let Value::String(s) = v {
240 self.min_s = Some(match self.min_s.take() {
241 None => s.clone(),
242 Some(prev) => {
243 if s < &prev {
244 s.clone()
245 } else {
246 prev
247 }
248 }
249 });
250 self.max_s = Some(match self.max_s.take() {
251 None => s.clone(),
252 Some(prev) => {
253 if s > &prev {
254 s.clone()
255 } else {
256 prev
257 }
258 }
259 });
260 }
261
262 if let Some(counts) = self.counts.as_mut() {
264 if let Some(key) = HashKey::from_value(v) {
265 if counts.contains_key(&key) {
266 *counts.get_mut(&key).unwrap() += 1;
267 } else if counts.len() < opts.distinct_limit {
268 counts.insert(key, 1);
269 } else {
270 self.cardinality_capped = true;
273 self.counts = None;
274 }
275 }
276 }
277 }
278
279 fn finalize(self) -> FieldStats {
280 let min = match self.ty {
281 ValueType::String => self.min_s.clone().map(JsonValue::String),
282 _ => self.min_f.and_then(jsonvalue_from_numeric_typed(self.ty)),
283 };
284 let max = match self.ty {
285 ValueType::String => self.max_s.clone().map(JsonValue::String),
286 _ => self.max_f.and_then(jsonvalue_from_numeric_typed(self.ty)),
287 };
288
289 let (distinct_count, top_values) = match (self.counts, self.cardinality_capped) {
290 (Some(counts), false) => {
291 let mut pairs: Vec<_> = counts.into_iter().collect();
292 pairs.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
293 let distinct = pairs.len() as u64;
294 let top = pairs
295 .into_iter()
296 .take(DEFAULT_TOP_N_FALLBACK)
297 .map(|(k, count)| TopValue {
298 value: k.into_json_value(),
299 count,
300 })
301 .collect();
302 (Some(distinct), top)
303 }
304 _ => (None, Vec::new()),
305 };
306
307 FieldStats {
308 name: self.name,
309 ty: format!("{:?}", self.ty),
310 null_count: self.null_count,
311 value_count: self.value_count,
312 distinct_count,
313 min,
314 max,
315 top_values,
316 }
317 }
318}
319
/// Fallback cap on the number of `top_values` entries reported per field;
/// equal to the `top_n` value produced by `ProfileOptions::default()`.
const DEFAULT_TOP_N_FALLBACK: usize = 10;
324
325fn jsonvalue_from_numeric_typed(ty: ValueType) -> impl Fn(f64) -> Option<JsonValue> {
326 move |n: f64| match ty {
327 ValueType::Bool => Some(JsonValue::Bool(n != 0.0)),
328 ValueType::Int16 | ValueType::Int32 | ValueType::Int64 => Some(JsonValue::Int(n as i64)),
329 ValueType::Float32 | ValueType::Float64 | ValueType::DateTime => Some(JsonValue::Float(n)),
330 _ => None,
331 }
332}
333
334fn is_hashable(ty: ValueType) -> bool {
335 matches!(
336 ty,
337 ValueType::Bool
338 | ValueType::Int16
339 | ValueType::Int32
340 | ValueType::Int64
341 | ValueType::String
342 | ValueType::DateTime
343 | ValueType::Guid
344 )
345}
346
347fn as_numeric(v: &Value) -> Option<f64> {
348 match v {
349 Value::Bool(b) => Some(if *b { 1.0 } else { 0.0 }),
350 Value::Int16(n) => Some(*n as f64),
351 Value::Int32(n) => Some(*n as f64),
352 Value::Int64(n) => Some(*n as f64),
353 Value::Float32(f) => Some(*f as f64),
354 Value::Float64(f) => Some(*f),
355 Value::DateTime(d) => Some(*d),
356 _ => None,
357 }
358}
359
/// Totally ordered key used to count distinct attribute values.
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
enum HashKey {
    /// Boolean value.
    Bool(bool),
    /// Any integer width, widened to `i64`.
    Int(i64),
    /// Text value.
    String(String),
    /// Raw bit pattern of a date-time's `f64` (obtained via `to_bits`).
    DateTimeBits(u64),
    /// The 16 raw bytes of a GUID.
    Guid([u8; 16]),
}
370
371impl HashKey {
372 fn from_value(v: &Value) -> Option<Self> {
373 match v {
374 Value::Bool(b) => Some(Self::Bool(*b)),
375 Value::Int16(n) => Some(Self::Int(*n as i64)),
376 Value::Int32(n) => Some(Self::Int(*n as i64)),
377 Value::Int64(n) => Some(Self::Int(*n)),
378 Value::String(s) => Some(Self::String(s.clone())),
379 Value::DateTime(d) => Some(Self::DateTimeBits(d.to_bits())),
380 Value::Guid(g) => Some(Self::Guid(*g)),
381 _ => None,
382 }
383 }
384
385 fn into_json_value(self) -> JsonValue {
386 match self {
387 Self::Bool(b) => JsonValue::Bool(b),
388 Self::Int(n) => JsonValue::Int(n),
389 Self::String(s) => JsonValue::String(s),
390 Self::DateTimeBits(bits) => JsonValue::Float(f64::from_bits(bits)),
391 Self::Guid(g) => JsonValue::String(hex_lower(&g)),
392 }
393 }
394}
395
/// Encodes `bytes` as a lowercase hexadecimal string, two digits per byte.
fn hex_lower(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut out = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        let high = usize::from(byte >> 4);
        let low = usize::from(byte & 0x0f);
        out.extend([char::from(DIGITS[high]), char::from(DIGITS[low])]);
    }
    out
}
405
406fn g_type(g: &geonative_core::Geometry) -> GeometryType {
407 use geonative_core::Geometry;
408 match g {
409 Geometry::Point(_) => GeometryType::Point,
410 Geometry::LineString(_) => GeometryType::LineString,
411 Geometry::Polygon(_) => GeometryType::Polygon,
412 Geometry::MultiPoint(_) => GeometryType::MultiPoint,
413 Geometry::MultiLineString(_) => GeometryType::MultiLineString,
414 Geometry::MultiPolygon(_) => GeometryType::MultiPolygon,
415 Geometry::GeometryCollection(_) => GeometryType::GeometryCollection,
416 _ => GeometryType::GeometryCollection,
417 }
418}
419
420fn geometry_kind_label(t: GeometryType) -> &'static str {
421 match t {
422 GeometryType::Point => "Point",
423 GeometryType::LineString => "LineString",
424 GeometryType::Polygon => "Polygon",
425 GeometryType::MultiPoint => "MultiPoint",
426 GeometryType::MultiLineString => "MultiLineString",
427 GeometryType::MultiPolygon => "MultiPolygon",
428 GeometryType::GeometryCollection => "GeometryCollection",
429 _ => "Unknown",
430 }
431}
432
433fn serialize_feature(schema: &Schema, feat: &Feature) -> SerdeFeature {
434 let mut attrs = BTreeMap::new();
435 for (i, field) in schema.fields.iter().enumerate() {
436 let v = feat.attributes.get(i).unwrap_or(&Value::Null);
437 attrs.insert(field.name.clone(), value_to_json_repr(v));
438 }
439 SerdeFeature {
440 fid: feat.fid,
441 geometry_kind: feat
442 .geometry
443 .as_ref()
444 .map(|g| geometry_kind_label(g_type(g)).to_string()),
445 attributes: attrs,
446 }
447}
448
449pub fn value_to_json_repr(v: &Value) -> JsonValue {
452 match v {
453 Value::Null => JsonValue::Null,
454 Value::Bool(b) => JsonValue::Bool(*b),
455 Value::Int16(n) => JsonValue::Int(*n as i64),
456 Value::Int32(n) => JsonValue::Int(*n as i64),
457 Value::Int64(n) => JsonValue::Int(*n),
458 Value::Float32(f) => JsonValue::Float(*f as f64),
459 Value::Float64(f) => JsonValue::Float(*f),
460 Value::String(s) | Value::Xml(s) => JsonValue::String(s.clone()),
461 Value::Binary(b) => JsonValue::String(hex_lower(b)),
462 Value::DateTime(d) => JsonValue::Float(*d),
463 Value::Guid(g) => JsonValue::String(hex_lower(g)),
464 _ => JsonValue::Null,
465 }
466}
467
// Unit tests for `profile`, driven by small in-memory feature sets.
#[cfg(test)]
mod tests {
    use super::*;
    use geonative_core::{Coord, Crs, FieldDef, GeomField, Geometry, GeometryType, Schema};

    // Three-field schema (string, int32, float64) with a point geometry column.
    fn mk_schema() -> Schema {
        Schema::new(
            vec![
                FieldDef::new("name", ValueType::String, true),
                FieldDef::new("score", ValueType::Int32, false),
                FieldDef::new("weight", ValueType::Float64, true),
            ],
            Some(GeomField::new("geometry", GeometryType::Point)),
            Crs::Epsg(4326),
        )
    }

    // Shorthand for a 2D point geometry.
    fn pt(x: f64, y: f64) -> Geometry {
        Geometry::Point(Coord::xy(x, y))
    }

    // Builds a feature matching `mk_schema`; `None` for `name`/`weight` becomes a null attribute.
    fn feat(fid: i64, name: Option<&str>, score: i32, weight: Option<f64>, x: f64, y: f64) -> Feature {
        let name_v = name
            .map(|s| Value::String(s.to_string()))
            .unwrap_or(Value::Null);
        let weight_v = weight.map(Value::Float64).unwrap_or(Value::Null);
        Feature::new(
            Some(fid),
            Some(pt(x, y)),
            vec![name_v, Value::Int32(score), weight_v],
        )
    }

    // Feature count, merged extent and kind histogram over three points.
    #[test]
    fn counts_features_and_extent() {
        let schema = mk_schema();
        let feats = vec![
            feat(1, Some("a"), 10, Some(1.0), 0.0, 0.0),
            feat(2, Some("b"), 20, Some(2.0), 10.0, 5.0),
            feat(3, Some("a"), 30, None, -3.0, 7.0),
        ];
        let report = profile(&schema, feats, ProfileOptions::default());
        assert_eq!(report.feature_count, 3);
        let ext = report.geometry.computed_extent.unwrap();
        assert_eq!(ext, [-3.0, 0.0, 10.0, 7.0]);
        assert_eq!(report.geometry.kinds.get("Point"), Some(&3));
        assert_eq!(report.geometry.null_count, 0);
    }

    // Null attributes are tallied separately for each field.
    #[test]
    fn nulls_counted_per_field() {
        let schema = mk_schema();
        let feats = vec![
            feat(1, None, 10, None, 0.0, 0.0),
            feat(2, Some("b"), 20, Some(2.0), 1.0, 1.0),
            feat(3, None, 30, Some(3.0), 2.0, 2.0),
        ];
        let report = profile(&schema, feats, ProfileOptions::default());
        let name = report.fields.iter().find(|f| f.name == "name").unwrap();
        assert_eq!(name.null_count, 2);
        assert_eq!(name.value_count, 1);
        let weight = report.fields.iter().find(|f| f.name == "weight").unwrap();
        assert_eq!(weight.null_count, 1);
    }

    // The most frequent value comes first in `top_values`.
    #[test]
    fn top_values_sorted_by_frequency() {
        let schema = mk_schema();
        let feats = vec![
            feat(1, Some("alice"), 1, None, 0.0, 0.0),
            feat(2, Some("bob"), 1, None, 0.0, 0.0),
            feat(3, Some("alice"), 1, None, 0.0, 0.0),
            feat(4, Some("alice"), 1, None, 0.0, 0.0),
            feat(5, Some("bob"), 1, None, 0.0, 0.0),
        ];
        let report = profile(&schema, feats, ProfileOptions::default());
        let name = report.fields.iter().find(|f| f.name == "name").unwrap();
        let top = &name.top_values;
        assert_eq!(top.len(), 2);
        assert_eq!(top[0].count, 3);
        assert_eq!(top[1].count, 2);
        match &top[0].value {
            JsonValue::String(s) => assert_eq!(s, "alice"),
            other => panic!("expected string, got {other:?}"),
        }
    }

    // 50 distinct ids against a limit of 10: distinct tracking must be dropped.
    #[test]
    fn distinct_capped_when_over_limit() {
        let schema = Schema::new(
            vec![FieldDef::new("id", ValueType::Int64, false)],
            None,
            Crs::Unknown,
        );
        let feats: Vec<Feature> = (0..50_i64)
            .map(|n| Feature::new(Some(n), None, vec![Value::Int64(n)]))
            .collect();
        let opts = ProfileOptions {
            distinct_limit: 10,
            ..Default::default()
        };
        let report = profile(&schema, feats, opts);
        assert_eq!(report.fields[0].distinct_count, None);
        assert!(report.fields[0].top_values.is_empty());
    }

    // Integer fields report their min/max as `JsonValue::Int`.
    #[test]
    fn min_max_numeric() {
        let schema = mk_schema();
        let feats = vec![
            feat(1, Some("a"), 5, None, 0.0, 0.0),
            feat(2, Some("b"), -3, None, 0.0, 0.0),
            feat(3, Some("c"), 100, None, 0.0, 0.0),
        ];
        let report = profile(&schema, feats, ProfileOptions::default());
        let score = report.fields.iter().find(|f| f.name == "score").unwrap();
        match (&score.min, &score.max) {
            (Some(JsonValue::Int(mn)), Some(JsonValue::Int(mx))) => {
                assert_eq!(*mn, -3);
                assert_eq!(*mx, 100);
            }
            other => panic!("expected int min/max, got {other:?}"),
        }
    }

    // Samples are the leading `sample_n` features, in input order.
    #[test]
    fn samples_are_first_n() {
        let schema = mk_schema();
        let feats: Vec<Feature> = (0..20)
            .map(|i| feat(i, Some(&format!("name{i}")), i as i32, None, 0.0, 0.0))
            .collect();
        let opts = ProfileOptions {
            sample_n: 3,
            ..Default::default()
        };
        let report = profile(&schema, feats, opts);
        assert_eq!(report.samples.len(), 3);
        assert_eq!(report.samples[0].fid, Some(0));
    }

    // Features without geometry count as null and add nothing to the kind histogram.
    #[test]
    fn null_geometry_counted() {
        let schema = mk_schema();
        let feats = vec![
            Feature::new(Some(1), None, vec![Value::Null, Value::Int32(1), Value::Null]),
            Feature::new(
                Some(2),
                Some(pt(0.0, 0.0)),
                vec![Value::Null, Value::Int32(2), Value::Null],
            ),
        ];
        let report = profile(&schema, feats, ProfileOptions::default());
        assert_eq!(report.geometry.null_count, 1);
        assert_eq!(report.geometry.kinds.get("Point"), Some(&1));
    }
}