use lazy_static::lazy_static;
use regex::Regex;
use std::borrow::Cow;
lazy_static! {
    // Phone number: an optional "+" followed by 1-2 digits, then digit groups of
    // 3, 3 and 4. The first group may be wrapped in parentheses, and the groups
    // may be separated by any run of whitespace, "." or "-".
    static ref PHONE: Regex =
        Regex::new(r#"(\+\d{1,2})?\s*\(?\d{3}\)?[\s\.-]*\d{3}[\s\.-]*\d{4}"#).unwrap();

    // Dotted quad: four groups of 1-3 digits separated by ".". The numeric range
    // of each group is not checked, so e.g. "999.999.999.999" also matches.
    static ref IP_ADDRESS: Regex = Regex::new(r#"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"#).unwrap();

    // E-mail address, case-insensitive: a local part of at least 3 characters,
    // then "@" or the word "at" delimited by brackets, parentheses or whitespace,
    // a domain label of at least 5 characters, "." or the word "dot", and finally
    // one of a fixed list of top-level domains.
    static ref EMAIL_ADDRESS: Regex = Regex::new(r#"(?i)[a-z0-9_\-]{3,}\s*(@|[\[\(\s]at[\s\)\]])\s*[a-z0-9_\-]{5,}\s*(\.|dot)\s*(com|net|org|gov|biz|co|us|ru|uk|de|se|to|tv|io|info|online|site)"#).unwrap();

    // Self-introduction, case-insensitive: optional "real ", then "name is"
    // (optionally followed by ":"), one alphabetic word and an optional second
    // alphabetic word.
    // The second word's class is `[a-zA-Z]`; the previous `[a-zA-z]` range also
    // accepted the ASCII punctuation that sits between 'Z' and 'a'
    // ("[", "\", "]", "^", "_", "`").
    static ref NAME: Regex =
        Regex::new(r#"(?i)(real\s)?name\s+is:?\s[a-zA-Z]+(\s[a-zA-Z]+)?"#).unwrap();

    // URL, case-insensitive: an optional "http"/"https" prefix with an optional
    // ":" and any number of "/", a label of at least 3 alphanumerics, ".", and
    // one of a fixed list of top-level domains. Nothing anchors the end of the
    // match, so a listed TLD may match as a prefix of a longer suffix.
    static ref URL: Regex = Regex::new(r#"(?i)(https?:?/*)?[a-zA-Z0-9]{3,}\.(com|net|org|gov|biz|co|us|ru|cc|uk|de|se|to|tv|io|gg|info|online|site|link)"#).unwrap();
}
pub fn censor_and_analyze_pii(s: &str) -> (String, bool) {
let ret = Cow::Borrowed(s);
let mut censored = false;
let ret = PHONE.replace_all(&ret, "***-****-****");
censored |= matches!(ret, Cow::Owned(_));
let ret = IP_ADDRESS.replace_all(&ret, "***.***.***.***");
censored |= matches!(ret, Cow::Owned(_));
let ret = EMAIL_ADDRESS.replace_all(&ret, "****@*****.***");
censored |= matches!(ret, Cow::Owned(_));
let ret = NAME.replace_all(&ret, "name is ***** *****");
censored |= matches!(ret, Cow::Owned(_));
let ret = URL.replace_all(&ret, "******.***");
censored |= matches!(ret, Cow::Owned(_));
(ret.into_owned(), censored)
}
#[cfg(test)]
mod tests {
    use super::censor_and_analyze_pii;

    /// Returns only the censored text produced for `s`.
    fn censor_pii(s: &str) -> String {
        let (text, _) = censor_and_analyze_pii(s);
        text
    }

    /// Returns only the "something was censored" flag produced for `s`.
    fn has_pii(s: &str) -> bool {
        let (_, flagged) = censor_and_analyze_pii(s);
        flagged
    }

    /// Every non-blank sample line must be flagged as containing PII.
    #[test]
    fn pii() {
        let samples = r#"
hello@gmail.com
hello f00 @ gmail.com
sus@yahoo.biz sus
foo[at]yahoo.com
foo [at] yahoo dot com
foo at yahoo dot com
foo @ twitch.tv
foo AT twitch.tv
1234567890
(123)4567890
+1 1234567890
+1 (123) 4567890
+12 (123) 456 7890
+1 (123) 456-7890
+1 123-456-7890
+1 123.456.7890
123.123.123.123
8.8.8.8
999.999.999.999
my name is: ALEX Smith
my real name is Alex smith
his name is alex smith
her real name is alex Smith
my name is alex. smith
hello.com
http://hello.com
https://foooo.com
barrr.com
example.org
twitch.tv
http:/chat.dev
https://w2g.tv/?r=
"#;
        for line in samples.lines().filter(|l| !l.trim().is_empty()) {
            assert!(has_pii(line), "{line}");
        }
        // Dump the censored form of the whole sample for manual inspection.
        println!("{}", censor_pii(samples));
    }

    /// No line of the safe corpora or of the inline samples may be flagged.
    #[test]
    fn not_pii() {
        let inline = r#"1234 Have 1234
gmail.zzz"#;
        let safe = include_str!("./safe.txt").lines();
        let false_positives = include_str!("./false_positives.txt").lines();
        for line in safe.chain(false_positives).chain(inline.lines()) {
            assert!(!has_pii(line), "{line}");
        }
        assert!(!has_pii("123 i have 4"));
    }

    /// An e-mail address embedded in a sentence is replaced by its placeholder.
    #[test]
    fn censor_pii_test() {
        let censored = censor_pii("mail me at foo@barrr.com, bye");
        assert_eq!(censored, "mail me at ****@*****.***, bye");
    }
}