schema_analysis/context/
string.rs1#![allow(missing_docs)]
2
3use std::collections::BTreeMap;
4
5use once_cell::sync::Lazy;
6use regex::Regex;
7use serde::{Deserialize, Serialize};
8
9use crate::{traits::Coalesce, Aggregate};
10
11use super::{
12 shared::{Counter, CountingSet, MinMax, Sampler},
13 Aggregators,
14};
15
16#[derive(Debug, Clone, Default, Serialize, Deserialize)]
17pub struct StringContext {
18 pub count: Counter,
19 pub samples: Sampler<String>,
20 #[serde(default, skip_serializing_if = "SuspiciousStrings::is_empty")]
22 pub suspicious_strings: SuspiciousStrings,
23 #[serde(default, skip_serializing_if = "SemanticExtractor::is_empty")]
25 pub semantic_extractor: SemanticExtractor,
26 pub min_max_length: MinMax<usize>,
27 #[serde(skip)]
28 pub other_aggregators: Aggregators<str>,
29}
30impl Aggregate<str> for StringContext {
31 fn aggregate(&mut self, value: &'_ str) {
32 self.count.aggregate(value);
33 self.samples.aggregate(value);
34 self.suspicious_strings.aggregate(value);
35 self.semantic_extractor.aggregate(value);
36 self.min_max_length.aggregate(&value.len());
37 self.other_aggregators.aggregate(value);
38 }
39}
40impl Coalesce for StringContext {
41 fn coalesce(&mut self, other: Self)
42 where
43 Self: Sized,
44 {
45 self.count.coalesce(other.count);
46 self.samples.coalesce(other.samples);
47 self.suspicious_strings.coalesce(other.suspicious_strings);
48 self.semantic_extractor.coalesce(other.semantic_extractor);
49 self.min_max_length.coalesce(other.min_max_length);
50 self.other_aggregators.coalesce(other.other_aggregators);
51 }
52}
53impl PartialEq for StringContext {
54 fn eq(&self, other: &Self) -> bool {
57 self.count == other.count
58 && self.samples == other.samples
59 && self.suspicious_strings == other.suspicious_strings
60 && self.semantic_extractor == other.semantic_extractor
61 && self.min_max_length == other.min_max_length
62 }
63}
64
/// Lowercase spellings of values that usually stand in for "no data".
///
/// Incoming strings are compared against this list case-insensitively, so
/// every entry must itself be lowercase.
// NOTE(review): the single-space entry is listed twice; confirm whether a
// longer run of whitespace was intended for the last entry.
const NORMALIZED_SUSPICIOUS_STRINGS: &[&str] = &[
    // Textual placeholders.
    "n/a", "na", "nan", "null", "none", "nil",
    // Punctuation placeholders.
    "?", "-", "/",
    // Empty / blank values.
    "", " ", " ",
];
72#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
75pub struct SuspiciousStrings(pub CountingSet<String>);
76impl SuspiciousStrings {
77 pub fn is_empty(&self) -> bool {
79 self.0.is_empty()
80 }
81}
82impl Aggregate<str> for SuspiciousStrings {
83 fn aggregate(&mut self, value: &'_ str) {
84 if NORMALIZED_SUSPICIOUS_STRINGS.contains(&value.to_lowercase().as_str()) {
85 self.0.insert(value);
86 }
87 }
88}
89impl Coalesce for SuspiciousStrings {
90 fn coalesce(&mut self, other: Self)
91 where
92 Self: Sized,
93 {
94 self.0.coalesce(other.0);
95 }
96}
97
/// `(name, pattern)` pairs describing the semantic shapes a string may have.
///
/// The patterns are unanchored here; `SEMANTIC_TARGETS` compiles each one so
/// that it must span the whole value, allowing surrounding whitespace.
const RAW_SEMANTIC_TARGETS: [(&str, &str); 5] = [
    ("Integer", r"[-+]?\d+"),
    // Accepts the same optional sign as "Integer", so that signed decimals
    // such as "-1.5" are recognised; both '.' and ',' are accepted as the
    // decimal separator.
    ("Simple Float", r"[-+]?\d+[.,]\d+"),
    ("Date 31-12-2001", r"\d{2}-\d{2}-\d{4}"),
    ("Date 2001-12-31", r"\d{4}-\d{2}-\d{2}"),
    // Case-insensitive; the trailing `(?-i)` switches the flag back off.
    ("Boolean", r"(?i)(true|yes|false|no)(?-i)"),
];
111
112static SEMANTIC_TARGETS: Lazy<BTreeMap<String, Regex>> = Lazy::new(|| {
113 fn from_pattern(p: &str) -> Regex {
114 Regex::new(&format!(r"^\s*{}\s*$", p)).unwrap()
115 }
116 RAW_SEMANTIC_TARGETS
117 .iter()
118 .map(|(n, p)| (n.to_string(), from_pattern(p)))
119 .collect()
120});
121#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
123pub struct SemanticExtractor(CountingSet<String>);
124impl SemanticExtractor {
125 pub fn is_empty(&self) -> bool {
127 self.0.is_empty()
128 }
129}
130impl Aggregate<str> for SemanticExtractor {
131 fn aggregate(&mut self, value: &'_ str) {
132 for (target, regex) in SEMANTIC_TARGETS.iter() {
133 if regex.is_match(value) {
134 self.0.insert(target);
135 }
136 }
137 }
138}
139impl Coalesce for SemanticExtractor {
140 fn coalesce(&mut self, other: Self)
141 where
142 Self: Sized,
143 {
144 self.0.coalesce(other.0);
145 }
146}