1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#![allow(missing_docs)]
use std::collections::BTreeMap;
use once_cell::sync::Lazy;
use regex::Regex;
use serde::{Deserialize, Serialize};
use crate::{traits::Coalesce, Aggregate};
use super::{
shared::{Counter, CountingSet, MinMax, Sampler},
Aggregators,
};
/// Aggregate context collected over a stream of string values.
///
/// Every `aggregate` call feeds the same input string to each of the
/// sub-aggregators below.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct StringContext {
    /// Number of values aggregated so far (see `shared::Counter`).
    pub count: Counter,
    /// Retained example values (see `shared::Sampler`).
    pub samples: Sampler<String>,
    /// Occurrences of placeholder-like values ("null", "n/a", ...);
    /// omitted from serialized output when empty.
    #[serde(default, skip_serializing_if = "SuspiciousStrings::is_empty")]
    pub suspicious_strings: SuspiciousStrings,
    /// Tally of semantic patterns (integers, dates, booleans, ...) matched
    /// by the values; omitted from serialized output when empty.
    #[serde(default, skip_serializing_if = "SemanticExtractor::is_empty")]
    pub semantic_extractor: SemanticExtractor,
    /// Shortest and longest value length, in bytes (`str::len`).
    pub min_max_length: MinMax<usize>,
    /// Runtime-only extension point: never (de)serialized and excluded
    /// from the manual `PartialEq` impl.
    #[serde(skip)]
    pub other_aggregators: Aggregators<str>,
}
/// Feeds one string value into every sub-aggregator of [`StringContext`].
impl Aggregate<str> for StringContext {
    fn aggregate(&mut self, value: &'_ str) {
        // Length is the byte length (`str::len`), not the character count.
        let byte_len = value.len();

        self.count.aggregate(value);
        self.samples.aggregate(value);
        self.suspicious_strings.aggregate(value);
        self.semantic_extractor.aggregate(value);
        self.min_max_length.aggregate(&byte_len);
        self.other_aggregators.aggregate(value);
    }
}
impl Coalesce for StringContext {
    /// Merges `other` into `self`, field by field.
    fn coalesce(&mut self, other: Self)
    where
        Self: Sized,
    {
        // Exhaustive destructuring: adding a field to `StringContext`
        // without updating this merge becomes a compile error.
        let Self {
            count,
            samples,
            suspicious_strings,
            semantic_extractor,
            min_max_length,
            other_aggregators,
        } = other;

        self.count.coalesce(count);
        self.samples.coalesce(samples);
        self.suspicious_strings.coalesce(suspicious_strings);
        self.semantic_extractor.coalesce(semantic_extractor);
        self.min_max_length.coalesce(min_max_length);
        self.other_aggregators.coalesce(other_aggregators);
    }
}
impl PartialEq for StringContext {
    /// Structural equality over the serializable fields only; the
    /// `#[serde(skip)]` field `other_aggregators` is not compared.
    fn eq(&self, other: &Self) -> bool {
        (
            &self.count,
            &self.samples,
            &self.suspicious_strings,
            &self.semantic_extractor,
            &self.min_max_length,
        ) == (
            &other.count,
            &other.samples,
            &other.suspicious_strings,
            &other.semantic_extractor,
            &other.min_max_length,
        )
    }
}
/// Lowercased placeholder strings commonly used to mean "no value".
/// `SuspiciousStrings::aggregate` compares against `value.to_lowercase()`,
/// so every entry here must already be lowercase.
/// NOTE(review): the last two entries are whitespace-only and render
/// identically; the final one appears to be a non-ASCII space
/// (e.g. U+00A0) — confirm that is intentional.
const NORMALIZED_SUSPICIOUS_STRINGS: &[&str] = &[
    "n/a", "na", "nan", "null", "none", "nil", "?", "-", "/", "", " ", " ",
];
/// Counting set of observed "suspicious" placeholder strings, stored with
/// their original casing (normalization happens only for the membership
/// test).
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct SuspiciousStrings(pub CountingSet<String>);

impl SuspiciousStrings {
    /// True when no suspicious value has been recorded; referenced by the
    /// `skip_serializing_if` attribute on `StringContext`.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}
impl Aggregate<str> for SuspiciousStrings {
    /// Records `value` (in its original casing) when its lowercase form is
    /// one of the known suspicious placeholder strings.
    fn aggregate(&mut self, value: &'_ str) {
        let normalized = value.to_lowercase();
        let is_suspicious = NORMALIZED_SUSPICIOUS_STRINGS
            .iter()
            .any(|candidate| *candidate == normalized);
        if is_suspicious {
            self.0.insert(value);
        }
    }
}
impl Coalesce for SuspiciousStrings {
fn coalesce(&mut self, other: Self)
where
Self: Sized,
{
self.0.coalesce(other.0);
}
}
/// Named raw regex patterns used to detect common value semantics.
/// These are unanchored; `SEMANTIC_TARGETS` wraps each one in
/// `^\s*...\s*$` before compiling, so whole-string matches (modulo
/// surrounding whitespace) are required.
const RAW_SEMANTIC_TARGETS: [(&str, &str); 5] = [
    // Optionally signed run of digits.
    ("Integer", r"[-+]?\d+"),
    // Digits, '.' or ',' separator, digits — no sign accepted here.
    ("Simple Float", r"\d+[.,]\d+"),
    // Day-month-year: two-digit day/month, four-digit year.
    ("Date 31-12-2001", r"\d{2}-\d{2}-\d{4}"),
    // Year-month-day (ISO-like ordering).
    ("Date 2001-12-31", r"\d{4}-\d{2}-\d{2}"),
    // Case-insensitive true/yes/false/no.
    ("Boolean", r"(?i)(true|yes|false|no)(?-i)"),
];
/// Compiled, anchored versions of [`RAW_SEMANTIC_TARGETS`], built once on
/// first access and keyed by target name (`BTreeMap` keeps iteration
/// order deterministic).
static SEMANTIC_TARGETS: Lazy<BTreeMap<String, Regex>> = Lazy::new(|| {
    // Anchor a raw pattern so it must match the whole input, modulo
    // surrounding whitespace. The pattern is wrapped in a non-capturing
    // group so a top-level alternation (`a|b`) cannot escape the anchors
    // (without it, `^\s*a|b\s*$` would anchor only the outer branches).
    fn from_pattern(p: &str) -> Regex {
        Regex::new(&format!(r"^\s*(?:{})\s*$", p))
            .expect("RAW_SEMANTIC_TARGETS patterns are valid regexes")
    }
    RAW_SEMANTIC_TARGETS
        .iter()
        .map(|(name, pattern)| (name.to_string(), from_pattern(pattern)))
        .collect()
});
/// Tally of semantic target names (the keys of `SEMANTIC_TARGETS`) matched
/// by the aggregated values.
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct SemanticExtractor(CountingSet<String>);

impl SemanticExtractor {
    /// True when no semantic target has matched yet; referenced by the
    /// `skip_serializing_if` attribute on `StringContext`.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}
impl Aggregate<str> for SemanticExtractor {
    /// Counts every semantic target whose anchored regex matches `value`.
    /// A single value may match several targets.
    fn aggregate(&mut self, value: &'_ str) {
        let matched_targets = SEMANTIC_TARGETS
            .iter()
            .filter(|(_, regex)| regex.is_match(value))
            .map(|(target, _)| target);
        for target in matched_targets {
            self.0.insert(target);
        }
    }
}
impl Coalesce for SemanticExtractor {
fn coalesce(&mut self, other: Self)
where
Self: Sized,
{
self.0.coalesce(other.0);
}
}