schema_analysis/context/
string.rs

1#![allow(missing_docs)]
2
3use std::collections::BTreeMap;
4
5use once_cell::sync::Lazy;
6use regex::Regex;
7use serde::{Deserialize, Serialize};
8
9use crate::{traits::Coalesce, Aggregate};
10
11use super::{
12    shared::{Counter, CountingSet, MinMax, Sampler},
13    Aggregators,
14};
15
/// Everything gathered about the string values aggregated for a single position in the schema.
///
/// Each field is an independent aggregator: [`Aggregate::aggregate`] feeds every incoming
/// string to all of them, and [`Coalesce::coalesce`] merges two contexts field by field.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct StringContext {
    /// Fed every aggregated string; presumably the number of strings seen (see `Counter`).
    pub count: Counter,
    /// Fed every aggregated string; presumably retains a sample of them (see `Sampler`).
    pub samples: Sampler<String>,
    /// Keeps track of any occurrences of strings that are known to be fishy.
    #[serde(default, skip_serializing_if = "SuspiciousStrings::is_empty")]
    pub suspicious_strings: SuspiciousStrings,
    /// Runs regexes on the strings to check whether they have interesting values.
    #[serde(default, skip_serializing_if = "SemanticExtractor::is_empty")]
    pub semantic_extractor: SemanticExtractor,
    /// Fed the length of every aggregated string, as returned by `str::len` (bytes, not chars).
    pub min_max_length: MinMax<usize>,
    /// Additional aggregators provided by the user of the library.
    ///
    /// Never (de)serialized, and ignored by this type's [`PartialEq`] implementation.
    #[serde(skip)]
    pub other_aggregators: Aggregators<str>,
}
30impl Aggregate<str> for StringContext {
31    fn aggregate(&mut self, value: &'_ str) {
32        self.count.aggregate(value);
33        self.samples.aggregate(value);
34        self.suspicious_strings.aggregate(value);
35        self.semantic_extractor.aggregate(value);
36        self.min_max_length.aggregate(&value.len());
37        self.other_aggregators.aggregate(value);
38    }
39}
40impl Coalesce for StringContext {
41    fn coalesce(&mut self, other: Self)
42    where
43        Self: Sized,
44    {
45        self.count.coalesce(other.count);
46        self.samples.coalesce(other.samples);
47        self.suspicious_strings.coalesce(other.suspicious_strings);
48        self.semantic_extractor.coalesce(other.semantic_extractor);
49        self.min_max_length.coalesce(other.min_max_length);
50        self.other_aggregators.coalesce(other.other_aggregators);
51    }
52}
53impl PartialEq for StringContext {
54    /// NOTE: [StringContext]'s [PartialEq] implementation ignores the `other_aggregators`
55    /// provided by the user of the library.
56    fn eq(&self, other: &Self) -> bool {
57        self.count == other.count
58            && self.samples == other.samples
59            && self.suspicious_strings == other.suspicious_strings
60            && self.semantic_extractor == other.semantic_extractor
61            && self.min_max_length == other.min_max_length
62    }
63}
64
65//
// SuspiciousStrings
67//
68
/// Lowercase spellings of values that commonly stand in for missing data.
///
/// Incoming strings are compared against this list case-insensitively.
const NORMALIZED_SUSPICIOUS_STRINGS: &[&str] = &[
    // Textual placeholders.
    "n/a",
    "na",
    "nan",
    "null",
    "none",
    "nil",
    // Lone punctuation.
    "?",
    "-",
    "/",
    // The empty string, a single space and two spaces.
    "",
    " ",
    "  ",
];
72/// Keeps track of any occurrences of strings that are known to be fishy,
73/// open a PR if you have more!
74#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
75pub struct SuspiciousStrings(pub CountingSet<String>);
76impl SuspiciousStrings {
77    /// Returns `true` if no suspicious strings have been found.
78    pub fn is_empty(&self) -> bool {
79        self.0.is_empty()
80    }
81}
82impl Aggregate<str> for SuspiciousStrings {
83    fn aggregate(&mut self, value: &'_ str) {
84        if NORMALIZED_SUSPICIOUS_STRINGS.contains(&value.to_lowercase().as_str()) {
85            self.0.insert(value);
86        }
87    }
88}
89impl Coalesce for SuspiciousStrings {
90    fn coalesce(&mut self, other: Self)
91    where
92        Self: Sized,
93    {
94        self.0.coalesce(other.0);
95    }
96}
97
98//
99// SemanticExtractor
// This is a proof of concept; more targets should be added later if it works well.
101//
102
/// `(name, pattern)` pairs describing the kinds of values looked for inside strings.
///
/// The patterns are not anchored here; they are anchored to the whole string (allowing
/// surrounding whitespace) when they are compiled.
const RAW_SEMANTIC_TARGETS: [(&str, &str); 5] = [
    // Optional sign followed by one or more digits.
    ("Integer", r"[-+]?\d+"),
    // Digits on both sides of a `.` or `,` separator; no sign, no exponent.
    ("Simple Float", r"\d+[.,]\d+"),
    // Only the digit grouping is checked, not whether the date is a valid one.
    ("Date 31-12-2001", r"\d{2}-\d{2}-\d{4}"),
    ("Date 2001-12-31", r"\d{4}-\d{2}-\d{2}"),
    // `(?i)` sets and `(?-i)` clears the case-insensitive flag.
    ("Boolean", r"(?i)(true|yes|false|no)(?-i)"),
];
111
112static SEMANTIC_TARGETS: Lazy<BTreeMap<String, Regex>> = Lazy::new(|| {
113    fn from_pattern(p: &str) -> Regex {
114        Regex::new(&format!(r"^\s*{}\s*$", p)).unwrap()
115    }
116    RAW_SEMANTIC_TARGETS
117        .iter()
118        .map(|(n, p)| (n.to_string(), from_pattern(p)))
119        .collect()
120});
121/// Runs regexes on the strings to check whether they have interesting values.
122#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
123pub struct SemanticExtractor(CountingSet<String>);
124impl SemanticExtractor {
125    /// Returns `true` if no interesting strings have been found.
126    pub fn is_empty(&self) -> bool {
127        self.0.is_empty()
128    }
129}
130impl Aggregate<str> for SemanticExtractor {
131    fn aggregate(&mut self, value: &'_ str) {
132        for (target, regex) in SEMANTIC_TARGETS.iter() {
133            if regex.is_match(value) {
134                self.0.insert(target);
135            }
136        }
137    }
138}
139impl Coalesce for SemanticExtractor {
140    fn coalesce(&mut self, other: Self)
141    where
142        Self: Sized,
143    {
144        self.0.coalesce(other.0);
145    }
146}