#![allow(missing_docs)]
use std::collections::BTreeMap;
use once_cell::sync::Lazy;
use regex::Regex;
use serde::{Deserialize, Serialize};
use crate::{traits::Coalesce, Aggregate};
use super::{
shared::{Counter, CountingSet, MinMax, Sampler},
Aggregators,
};
/// Aggregated statistics for a stream of string values.
///
/// Each field is an independent sub-aggregator; `Aggregate::aggregate`
/// feeds every incoming value into all of them.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct StringContext {
    /// Total number of values aggregated.
    pub count: Counter,
    /// Sample of observed values (selection policy defined by `Sampler`).
    pub samples: Sampler<String>,
    /// Values whose lowercased form is a known null/placeholder marker
    /// ("n/a", "null", …); omitted from serialization while empty.
    #[serde(default, skip_serializing_if = "SuspiciousStrings::is_empty")]
    pub suspicious_strings: SuspiciousStrings,
    /// Per-target match counts for the semantic patterns (integer, float,
    /// date, boolean); omitted from serialization while empty.
    #[serde(default, skip_serializing_if = "SemanticExtractor::is_empty")]
    pub semantic_extractor: SemanticExtractor,
    /// Shortest and longest value length seen, in bytes (via `str::len`).
    pub min_max_length: MinMax<usize>,
    /// Additional pluggable aggregators; never serialized and deliberately
    /// ignored by this type's `PartialEq` implementation.
    #[serde(skip)]
    pub other_aggregators: Aggregators<str>,
}
impl Aggregate<str> for StringContext {
    /// Feeds a single string value into every sub-aggregator.
    fn aggregate(&mut self, value: &'_ str) {
        // Byte length of the value, tracked separately for min/max stats.
        let byte_len = value.len();
        self.count.aggregate(value);
        self.samples.aggregate(value);
        self.suspicious_strings.aggregate(value);
        self.semantic_extractor.aggregate(value);
        self.min_max_length.aggregate(&byte_len);
        self.other_aggregators.aggregate(value);
    }
}
impl Coalesce for StringContext {
    /// Merges the statistics collected in `other` into `self`, field by field.
    fn coalesce(&mut self, other: Self)
    where
        Self: Sized,
    {
        // Exhaustive destructuring: adding a field to `StringContext`
        // without merging it here becomes a compile error.
        let Self {
            count,
            samples,
            suspicious_strings,
            semantic_extractor,
            min_max_length,
            other_aggregators,
        } = other;
        self.count.coalesce(count);
        self.samples.coalesce(samples);
        self.suspicious_strings.coalesce(suspicious_strings);
        self.semantic_extractor.coalesce(semantic_extractor);
        self.min_max_length.coalesce(min_max_length);
        self.other_aggregators.coalesce(other_aggregators);
    }
}
impl PartialEq for StringContext {
    /// Compares all statistics fields; `other_aggregators` is deliberately
    /// excluded (it is also skipped during serialization).
    fn eq(&self, other: &Self) -> bool {
        (
            &self.count,
            &self.samples,
            &self.suspicious_strings,
            &self.semantic_extractor,
            &self.min_max_length,
        ) == (
            &other.count,
            &other.samples,
            &other.suspicious_strings,
            &other.semantic_extractor,
            &other.min_max_length,
        )
    }
}
/// Lowercased strings commonly used as null/placeholder markers; compared
/// against the lowercased input in `SuspiciousStrings::aggregate`.
// NOTE(review): the last two entries both render as a single space — confirm
// whether one was intended to be a distinct character (e.g. U+00A0 NBSP).
const NORMALIZED_SUSPICIOUS_STRINGS: &[&str] = &[
    "n/a", "na", "nan", "null", "none", "nil", "?", "-", "/", "", " ", " ",
];
/// Occurrence counts of values that matched the suspicious-string list,
/// keyed by the original (pre-lowercasing) spelling.
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct SuspiciousStrings(pub CountingSet<String>);
impl SuspiciousStrings {
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
}
impl Aggregate<str> for SuspiciousStrings {
    /// Records `value` when its lowercased form is one of the known
    /// null/placeholder markers.
    fn aggregate(&mut self, value: &'_ str) {
        let normalized = value.to_lowercase();
        let is_suspicious = NORMALIZED_SUSPICIOUS_STRINGS
            .iter()
            .any(|&candidate| candidate == normalized);
        if is_suspicious {
            // Store the original spelling, not the normalized one.
            self.0.insert(value);
        }
    }
}
impl Coalesce for SuspiciousStrings {
fn coalesce(&mut self, other: Self)
where
Self: Sized,
{
self.0.coalesce(other.0);
}
}
/// Human-readable name → unanchored regex source for each semantic type the
/// extractor recognizes. Anchoring (`^\s*…\s*$`) is added when the patterns
/// are compiled into `SEMANTIC_TARGETS`.
const RAW_SEMANTIC_TARGETS: [(&str, &str); 5] = [
    ("Integer", r"[-+]?\d+"),
    // Decimal separator may be either '.' or ','.
    ("Simple Float", r"\d+[.,]\d+"),
    ("Date 31-12-2001", r"\d{2}-\d{2}-\d{4}"),
    ("Date 2001-12-31", r"\d{4}-\d{2}-\d{2}"),
    // Case-insensitive match of common boolean words.
    ("Boolean", r"(?i)(true|yes|false|no)(?-i)"),
];
/// Lazily compiled, anchored versions of `RAW_SEMANTIC_TARGETS`, keyed by
/// target name. Each pattern is wrapped in `^\s*…\s*$` so it must match the
/// entire value, modulo surrounding whitespace.
static SEMANTIC_TARGETS: Lazy<BTreeMap<String, Regex>> = Lazy::new(|| {
    let mut targets = BTreeMap::new();
    for &(name, pattern) in RAW_SEMANTIC_TARGETS.iter() {
        let anchored = format!(r"^\s*{}\s*$", pattern);
        // All patterns are static literals, so compilation cannot fail.
        targets.insert(name.to_string(), Regex::new(&anchored).unwrap());
    }
    targets
});
/// Counts, per semantic target name, how many aggregated values matched that
/// target's anchored pattern (see `SEMANTIC_TARGETS`).
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct SemanticExtractor(CountingSet<String>);
impl SemanticExtractor {
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
}
impl Aggregate<str> for SemanticExtractor {
    /// Bumps the count of every semantic target whose pattern matches `value`.
    fn aggregate(&mut self, value: &'_ str) {
        let matched_targets = SEMANTIC_TARGETS
            .iter()
            .filter(|(_, regex)| regex.is_match(value))
            .map(|(name, _)| name);
        for name in matched_targets {
            self.0.insert(name);
        }
    }
}
impl Coalesce for SemanticExtractor {
fn coalesce(&mut self, other: Self)
where
Self: Sized,
{
self.0.coalesce(other.0);
}
}