#![allow(missing_docs)]

use std::collections::BTreeMap;

use once_cell::sync::Lazy;
use regex::Regex;
use serde::{Deserialize, Serialize};

use crate::{traits::Coalesce, Aggregate};

use super::{
    shared::{Counter, CountingSet, MinMax, Sampler},
    Aggregators,
};

#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct StringContext {
    pub count: Counter,
    pub samples: Sampler<String>,
    /// Keeps track of any occurrences of strings that are known to be fishy.
    #[serde(default, skip_serializing_if = "SuspiciousStrings::is_empty")]
    pub suspicious_strings: SuspiciousStrings,
    /// Runs regexes on the strings to check whether they have interesting values.
    #[serde(default, skip_serializing_if = "SemanticExtractor::is_empty")]
    pub semantic_extractor: SemanticExtractor,
    pub min_max_length: MinMax<usize>,
    #[serde(skip)]
    pub other_aggregators: Aggregators<str>,
}
impl Aggregate<str> for StringContext {
    fn aggregate(&mut self, value: &'_ str) {
        self.count.aggregate(value);
        self.samples.aggregate(value);
        self.suspicious_strings.aggregate(value);
        self.semantic_extractor.aggregate(value);
        self.min_max_length.aggregate(&value.len());
        self.other_aggregators.aggregate(value);
    }
}
impl Coalesce for StringContext {
    fn coalesce(&mut self, other: Self)
    where
        Self: Sized,
    {
        self.count.coalesce(other.count);
        self.samples.coalesce(other.samples);
        self.suspicious_strings.coalesce(other.suspicious_strings);
        self.semantic_extractor.coalesce(other.semantic_extractor);
        self.min_max_length.coalesce(other.min_max_length);
        self.other_aggregators.coalesce(other.other_aggregators);
    }
}
impl PartialEq for StringContext {
    /// NOTE: [StringContext]'s [PartialEq] implementation ignores the `other_aggregators`
    /// provided by the user of the library.
    fn eq(&self, other: &Self) -> bool {
        self.count == other.count
            && self.samples == other.samples
            && self.suspicious_strings == other.suspicious_strings
            && self.semantic_extractor == other.semantic_extractor
            && self.min_max_length == other.min_max_length
    }
}
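// A minimal usage sketch, not part of the library itself: it assumes
// `CountingSet::insert` records its argument (which the name and `is_empty`
// suggest). Each observed string is fed through `Aggregate`, and contexts
// built on separate shards are merged with `Coalesce`.
#[cfg(test)]
mod string_context_sketch {
    use super::*;

    #[test]
    fn aggregate_then_coalesce() {
        let mut left = StringContext::default();
        left.aggregate("hello");
        left.aggregate("N/A"); // lowercased to "n/a", so flagged as suspicious

        let mut right = StringContext::default();
        right.aggregate("42"); // hits the "Integer" semantic target

        // Merging keeps the findings of both shards.
        left.coalesce(right);
        assert!(!left.suspicious_strings.is_empty());
        assert!(!left.semantic_extractor.is_empty());
    }
}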

//
// SuspiciousStrings
//

const NORMALIZED_SUSPICIOUS_STRINGS: &[&str] = &[
    "n/a", "na", "nan", "null", "none", "nil", "?", "-", "/", "", " ", "  ",
];
/// Keeps track of any occurrences of strings that are known to be fishy.
/// Open a PR if you have more!
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct SuspiciousStrings(pub CountingSet<String>);
impl SuspiciousStrings {
    /// Returns `true` if no suspicious strings have been found.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}
impl Aggregate<str> for SuspiciousStrings {
    fn aggregate(&mut self, value: &'_ str) {
        if NORMALIZED_SUSPICIOUS_STRINGS.contains(&value.to_lowercase().as_str()) {
            self.0.insert(value);
        }
    }
}
impl Coalesce for SuspiciousStrings {
    fn coalesce(&mut self, other: Self)
    where
        Self: Sized,
    {
        self.0.coalesce(other.0);
    }
}
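// A small sketch of the normalization above (same `CountingSet::insert`
// assumption as before): matching is case-insensitive because the candidate
// is lowercased first, while the original spelling is what gets inserted.
#[cfg(test)]
mod suspicious_strings_sketch {
    use super::*;

    #[test]
    fn lowercases_before_matching() {
        let mut sus = SuspiciousStrings::default();
        sus.aggregate("NULL"); // "null" is in NORMALIZED_SUSPICIOUS_STRINGS
        sus.aggregate("hello"); // not in the list, ignored
        assert!(!sus.is_empty());
    }
}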

//
// SemanticExtractor
// This is a proof of concept; more targets should be added later if it works well.
//

const RAW_SEMANTIC_TARGETS: [(&str, &str); 5] = [
    ("Integer", r"[-+]?\d+"),
    ("Simple Float", r"\d+[.,]\d+"),
    ("Date 31-12-2001", r"\d{2}-\d{2}-\d{4}"),
    ("Date 2001-12-31", r"\d{4}-\d{2}-\d{2}"),
    // `(?i)` sets and `(?-i)` clears the case-insensitive flag.
    ("Boolean", r"(?i)(true|yes|false|no)(?-i)"),
];

static SEMANTIC_TARGETS: Lazy<BTreeMap<String, Regex>> = Lazy::new(|| {
    fn from_pattern(p: &str) -> Regex {
        Regex::new(&format!(r"^\s*{}\s*$", p)).unwrap()
    }
    RAW_SEMANTIC_TARGETS
        .iter()
        .map(|(n, p)| (n.to_string(), from_pattern(p)))
        .collect()
});
/// Runs regexes on the strings to check whether they have interesting values.
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct SemanticExtractor(CountingSet<String>);
impl SemanticExtractor {
    /// Returns `true` if no interesting strings have been found.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}
impl Aggregate<str> for SemanticExtractor {
    fn aggregate(&mut self, value: &'_ str) {
        for (target, regex) in SEMANTIC_TARGETS.iter() {
            if regex.is_match(value) {
                self.0.insert(target);
            }
        }
    }
}
impl Coalesce for SemanticExtractor {
    fn coalesce(&mut self, other: Self)
    where
        Self: Sized,
    {
        self.0.coalesce(other.0);
    }
}
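
// A usage sketch for the extractor (values are illustrative): because
// `from_pattern` wraps each pattern in `^\s*...\s*$`, only full matches
// count, modulo surrounding whitespace.
#[cfg(test)]
mod semantic_extractor_sketch {
    use super::*;

    #[test]
    fn full_matches_only() {
        let mut extractor = SemanticExtractor::default();
        extractor.aggregate(" 42 "); // full match for "Integer" despite padding
        assert!(!extractor.is_empty());

        let mut no_hit = SemanticExtractor::default();
        no_hit.aggregate("42 apples"); // embedded digits do not count
        assert!(no_hit.is_empty());
    }
}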