use once_cell::sync::Lazy;
use regex::{Captures, Regex};
use rhai::{Dynamic, Engine, Map};
use std::collections::HashMap;
/// Returns `true` when `matched` is a plausibly assignable US Social Security
/// number written as `AAA-GG-SSSS`.
///
/// The string must consist of exactly three `-`-separated groups of 3, 2 and 4
/// ASCII digits. On top of the shape check, the following values are rejected:
///
/// * area `000`, `666`, or anything from `900` upwards,
/// * group `00`,
/// * serial `0000`.
///
/// The shape is verified here rather than relying on the caller's regex, so
/// inputs such as `1-2-3` or `+12-45-6789` (which `str::parse` would accept
/// because it tolerates a leading `+`) are rejected.
fn is_valid_ssn(matched: &str) -> bool {
    let mut pieces = matched.split('-');
    let (area, group, serial) = match (pieces.next(), pieces.next(), pieces.next(), pieces.next())
    {
        // Exactly three groups: a fourth piece means there was an extra dash.
        (Some(area), Some(group), Some(serial), None) => (area, group, serial),
        _ => return false,
    };

    // Enforce the fixed 3-2-4 widths and digits-only content before parsing.
    let well_formed = [(area, 3usize), (group, 2), (serial, 4)]
        .iter()
        .all(|&(piece, width)| piece.len() == width && piece.bytes().all(|b| b.is_ascii_digit()));
    if !well_formed {
        return false;
    }

    let (area, group, serial) = match (
        area.parse::<u16>(),
        group.parse::<u16>(),
        serial.parse::<u16>(),
    ) {
        (Ok(area), Ok(group), Ok(serial)) => (area, group, serial),
        _ => return false,
    };

    let area_assignable = area != 0 && area != 666 && area < 900;
    area_assignable && group != 0 && serial != 0
}
/// Returns `true` when the digits contained in `matched` look like a usable
/// phone number.
///
/// All non-digit characters are ignored. The remaining digits are classified:
///
/// * 11 digits starting with `1`, or exactly 10 digits: the last 10 digits are
///   checked as `NXX-NXX-XXXX`, i.e. both the area code and the exchange must
///   start with a digit in `2..=9`.
/// * any other count: accepted if there are at least 7 digits (no further
///   structure is checked).
fn is_valid_phone(matched: &str) -> bool {
    // Non-ASCII characters never contain ASCII-digit bytes, so scanning bytes
    // selects the same digits as scanning chars.
    let digits: Vec<u8> = matched
        .bytes()
        .filter(u8::is_ascii_digit)
        .map(|b| b - b'0')
        .collect();

    let national: &[u8] = match digits.as_slice() {
        // Leading country code `1` followed by a 10-digit national number.
        [1, rest @ ..] if rest.len() == 10 => rest,
        bare if bare.len() == 10 => bare,
        // Anything else only gets a minimum-length check.
        other => return other.len() >= 7,
    };

    // `national` is exactly 10 digits long here, so indexing below is in bounds.
    if national[0] < 2 || national[3] < 2 {
        return false;
    }

    // NOTE(review): this pattern requires the exchange's first digit to be 0,
    // which the `< 2` check above has already rejected, so it can never match
    // at this point. Kept to preserve the original logic — confirm whether the
    // intent was to reject the 555-01XX exchange/line range instead.
    !matches!(national, [5, 5, 5, 0, 1, sixth, ..] if *sixth < 2)
}
/// Returns `true` when the ASCII digits found in `digits` form a number that
/// passes the Luhn checksum.
///
/// Non-digit characters (spaces, dashes, ...) are skipped. The digit count must
/// be within `13..=19`, and a number made only of zeros is rejected even though
/// its checksum would be divisible by ten.
fn is_valid_luhn(digits: &str) -> bool {
    let values: Vec<u32> = digits
        .chars()
        .filter_map(|ch| {
            if ch.is_ascii_digit() {
                ch.to_digit(10)
            } else {
                None
            }
        })
        .collect();

    if !(13..=19).contains(&values.len()) {
        return false;
    }
    if values.iter().all(|&value| value == 0) {
        return false;
    }

    // Walk from the rightmost digit, doubling every second one.
    let mut checksum = 0u32;
    let mut double_this = false;
    for &value in values.iter().rev() {
        checksum += if double_this {
            let doubled = value * 2;
            // Subtracting 9 equals summing the two digits of `doubled`.
            if doubled > 9 {
                doubled - 9
            } else {
                doubled
            }
        } else {
            value
        };
        double_this = !double_this;
    }

    checksum % 10 == 0
}
/// Table of named regexes used for normalization.
///
/// Each key is a pattern name (for example `ipv4`, `email`, `mac`) and each
/// value is the list of regexes registered under that name. A name may carry
/// several alternative regexes (`mac`, `duration` and `phone` do);
/// `normalized_str_impl` applies every regex of a requested name in turn and
/// replaces the matches with `<name>`.
///
/// The table is built once, on first access, through `Lazy`. Every regex is
/// compiled with `unwrap()`, so an invalid literal panics on that first access.
///
/// The names `credit_card`, `ssn` and `phone` also have an entry in
/// `VALIDATORS`; their matches are only replaced when the validator accepts
/// the matched text.
///
/// NOTE(review): the `fqdn` entry looks truncated in this copy of the file —
/// the opening of its `vec![Regex::new(...)]` call and the head of the pattern
/// are missing, leaving only the tail of the regex. Restore that line from
/// version control before building.
static PATTERNS: Lazy<HashMap<&'static str, Vec<Regex>>> = Lazy::new(|| {
let mut map = HashMap::new();
// One decimal IPv4 octet in the range 0-255; shared by the two IPv4 patterns.
let octet = r"(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)";
map.insert(
"ipv4",
vec![Regex::new(&format!(r"\b{octet}\.{octet}\.{octet}\.{octet}\b")).unwrap()],
);
// IPv4 address followed by `:port`, with the port limited to 0-65535.
// Braces are doubled because the pattern text goes through `format!`.
map.insert(
"ipv4_port",
vec![Regex::new(&format!(
r"\b{octet}\.{octet}\.{octet}\.{octet}:(?:0|[1-9]\d{{0,3}}|[1-5]\d{{4}}|6[0-4]\d{{3}}|65[0-4]\d{{2}}|655[0-2]\d|6553[0-5])\b"
))
.unwrap()],
);
// Alternation of several IPv6 spellings: full eight-group form, `::`
// compressed forms, `fe80:` link-local with a `%zone` suffix, and
// `::`/`::ffff:` followed by a dotted IPv4 address.
map.insert(
"ipv6",
vec![Regex::new(
r"(?i)\b(?:[0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4}\b|(?:[0-9A-Fa-f]{1,4}:){1,6}:[0-9A-Fa-f]{1,4}|(?:[0-9A-Fa-f]{1,4}:){1,5}(?::[0-9A-Fa-f]{1,4}){1,2}|fe80:(?::[0-9A-Fa-f]{0,4}){0,4}%[0-9A-Za-z]{1,}|::(?:ffff:)?(?:(?:25[0-5]|2[0-4]\d|1?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|1?\d?\d)"
)
.unwrap()],
);
map.insert(
"email",
vec![Regex::new(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b").unwrap()],
);
// `scheme://` followed by optional userinfo, host, port, path, query and
// fragment.
map.insert(
"url",
vec![Regex::new(
r"\b(?:[a-z][a-z0-9+.-]*):\/\/(?:(?:[^\s:@]+(?::[^\s:@]*)?@)?(?:[^\s:/?#]+)(?::\d+)?(?:\/[^\s?#]*)?(?:\?[^\s#]*)?(?:#[^\s]*)?)\b"
)
.unwrap()],
);
// NOTE(review): the line after the `fqdn` key is the damaged one described in
// the header comment of this table.
map.insert(
"fqdn",
vec?\.){2,}[a-z0-9][a-z0-9-]{0,8}\b")
.unwrap(),
],
);
map.insert(
"uuid",
vec![Regex::new(
r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b",
)
.unwrap()],
);
map.insert(
"mac",
vec![
Regex::new(r"\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b").unwrap(),
Regex::new(r"\b(?:[0-9A-Fa-f]{4}\.){2}[0-9A-Fa-f]{4}\b").unwrap(),
],
);
map.insert("md5", vec![Regex::new(r"\b[a-fA-F0-9]{32}\b").unwrap()]);
map.insert("sha1", vec![Regex::new(r"\b[a-fA-F0-9]{40}\b").unwrap()]);
map.insert("sha256", vec![Regex::new(r"\b[a-fA-F0-9]{64}\b").unwrap()]);
map.insert("path", vec![Regex::new(r"\B(/[\w./-]+)").unwrap()]);
map.insert(
"oauth",
vec![Regex::new(r"\bya29\.[0-9A-Za-z_-]+\b").unwrap()],
);
map.insert("function", vec![Regex::new(r"\b[\w\.]+\([^)]*\)").unwrap()]);
map.insert("hexcolor", vec![Regex::new(r"#[0-9A-Fa-f]{6}\b").unwrap()]);
map.insert(
"version",
vec![Regex::new(r"\b[vV]\d+\.\d+(?:\.\d+)?(?:-[a-zA-Z0-9]+)?\b").unwrap()],
);
map.insert("hexnum", vec![Regex::new(r"\b0x[0-9a-fA-F]+\b").unwrap()]);
map.insert(
"duration",
vec![
Regex::new(r"\b\d+(?:\.\d+)?(?:us|ms|[smhd])\b").unwrap(),
Regex::new(r"\b\d+(?:\.\d+)?\s*(?:microsecond|millisecond|second|minute|hour|day|week|month|year)s?\b").unwrap(),
Regex::new(r"\b(?:\d+h\d+m\d+s|\d+h\d+m|\d+h\d+s|\d+m\d+s)\b").unwrap(),
],
);
map.insert(
"num",
vec![Regex::new(r"[+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?").unwrap()],
);
map.insert(
"credit_card",
vec![Regex::new(r"\b\d(?:[ -]?\d){12,18}\b").unwrap()],
);
map.insert("ssn", vec![Regex::new(r"\b\d{3}-\d{2}-\d{4}\b").unwrap()]);
map.insert(
"phone",
vec![
Regex::new(r"\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}").unwrap(),
Regex::new(r"\(\d{3}\)[-.\s]?\d{3}[-.\s]?\d{4}").unwrap(),
Regex::new(r"\b\d{3}[-.\s]\d{3}[-.\s]\d{4}\b").unwrap(),
],
);
map
});
/// Signature of a post-match check: receives the text a regex matched and
/// reports whether that match should actually be replaced.
type Validator = fn(&str) -> bool;

/// Semantic checks keyed by pattern name.
///
/// Pattern names listed here have their regex matches passed through the
/// validator by `normalized_str_impl`; matches the validator rejects are left
/// in the text unchanged. Names without an entry are replaced unconditionally.
static VALIDATORS: Lazy<HashMap<&'static str, Validator>> = Lazy::new(|| {
    HashMap::from([
        ("credit_card", is_valid_luhn as Validator),
        ("ssn", is_valid_ssn as Validator),
        ("phone", is_valid_phone as Validator),
    ])
});
/// Pattern names used by `normalized_str_default` and `normalized_map_default`
/// when the caller supplies no explicit pattern spec.
///
/// Order matters: `normalized_str_impl` applies the names in slice order, so
/// `ipv4_port` is listed before `ipv4` to let an `address:port` pair be
/// replaced as a whole before the bare address pattern can claim its first
/// half.
///
/// `PATTERNS` also defines `hexnum`, `duration`, `num`, `credit_card`, `ssn`
/// and `phone`; those are not part of the defaults and must be requested by
/// name.
const DEFAULT_PATTERNS: &[&str] = &[
"ipv4_port",
"ipv4",
"ipv6",
"email",
"url",
"fqdn",
"uuid",
"mac",
"md5",
"sha1",
"sha256",
"path",
"oauth",
"function",
"hexcolor",
"version",
];
/// Converts a script-supplied pattern spec into a list of pattern names.
///
/// Two shapes are accepted:
///
/// * a string — split on `,`, with each piece trimmed of surrounding
///   whitespace;
/// * an array — every element must be a string and is taken as-is (no
///   trimming).
///
/// # Errors
///
/// Returns a descriptive message when `spec` is neither a string nor an array,
/// when an array element is not a string, or when the conversion out of the
/// `Dynamic` fails.
fn parse_patterns(spec: Dynamic) -> Result<Vec<String>, String> {
    if spec.is_string() {
        return match spec.into_string() {
            Ok(csv) => Ok(csv
                .split(',')
                .map(|name| name.trim().to_string())
                .collect()),
            Err(_) => Err("Failed to convert to string".to_string()),
        };
    }

    if spec.is_array() {
        let items = match spec.into_array() {
            Ok(items) => items,
            Err(_) => return Err("Failed to convert to array".to_string()),
        };
        let mut names = Vec::with_capacity(items.len());
        for item in items {
            match item.into_string() {
                Ok(name) => names.push(name),
                // Stop at the first offending element, like a fallible collect.
                Err(_) => return Err("Array element is not a string".to_string()),
            }
        }
        return Ok(names);
    }

    Err("Pattern spec must be a string or array".to_string())
}
/// Replaces every match of the requested patterns in `text` with a `<name>`
/// placeholder and returns the rewritten string.
///
/// `patterns` lists pattern names in the order they are applied; names that
/// are not present in `PATTERNS` are ignored. When a name has an entry in
/// `VALIDATORS`, a regex match is only replaced if the validator accepts the
/// matched text.
///
/// The work is done in two passes so that a later pattern can never match
/// inside a placeholder produced by an earlier one:
///
/// 1. each match is replaced by a single marker character taken from the
///    Unicode private-use range `U+E000..=U+F8FF`;
/// 2. once all patterns have run, every marker is expanded to its `<name>`.
///
/// Markers are chosen among the private-use characters that do **not** occur in
/// `text`, so private-use characters already present in the input are left
/// alone instead of being mistaken for markers. If that pool is exhausted, the
/// remaining patterns are skipped.
fn normalized_str_impl(text: &str, patterns: &[String]) -> String {
    let mut result = text.to_string();
    // One entry per applied pattern, regardless of how many regexes it has.
    let mut replacements: Vec<(char, String)> = Vec::with_capacity(patterns.len());
    // Lazily yields private-use characters that are safe to use as markers.
    let mut free_markers = ('\u{E000}'..='\u{F8FF}').filter(|candidate| !text.contains(*candidate));

    for pattern_name in patterns {
        let regexes = match PATTERNS.get(pattern_name.as_str()) {
            Some(regexes) => regexes,
            None => continue,
        };
        let marker = match free_markers.next() {
            Some(marker) => marker,
            None => break,
        };
        let marker_text = marker.to_string();
        let validator = VALIDATORS.get(pattern_name.as_str()).copied();
        replacements.push((marker, format!("<{}>", pattern_name)));

        for regex in regexes {
            let replaced = match validator {
                Some(validate) => regex.replace_all(&result, |caps: &Captures| {
                    let matched = &caps[0];
                    if validate(matched) {
                        marker_text.clone()
                    } else {
                        // Rejected by the validator: keep the text as it was.
                        matched.to_string()
                    }
                }),
                None => regex.replace_all(&result, marker_text.as_str()),
            };
            // `replace_all` hands back a borrow of the input when nothing
            // matched; only take the new string when one was actually built.
            let rewritten = match replaced {
                std::borrow::Cow::Owned(rewritten) => Some(rewritten),
                std::borrow::Cow::Borrowed(_) => None,
            };
            if let Some(rewritten) = rewritten {
                result = rewritten;
            }
        }
    }

    for (marker, placeholder) in &replacements {
        result = result.replace(*marker, placeholder);
    }
    result
}
/// Normalizes `text` with every pattern named in `DEFAULT_PATTERNS`, applied in
/// the order they are listed there.
fn normalized_str_default(text: &str) -> String {
    let default_names: Vec<String> = DEFAULT_PATTERNS
        .iter()
        .copied()
        .map(String::from)
        .collect();
    normalized_str_impl(text, &default_names)
}
/// Normalizes `text` with the patterns described by `spec`.
///
/// `spec` is interpreted by `parse_patterns` (comma-separated string or array
/// of strings).
///
/// # Errors
///
/// Returns an `ErrorRuntime` carrying the `parse_patterns` message, with no
/// source position, when `spec` cannot be interpreted.
fn normalized_str_with_patterns(
    text: &str,
    spec: Dynamic,
) -> Result<String, Box<rhai::EvalAltResult>> {
    match parse_patterns(spec) {
        Ok(names) => Ok(normalized_str_impl(text, &names)),
        Err(message) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
            message.into(),
            rhai::Position::NONE,
        ))),
    }
}
fn normalized_map_impl(map: &mut Map, patterns: &[String]) {
for (_key, value) in map.iter_mut() {
if value.is_string() {
if let Ok(s) = value.clone().into_string() {
*value = Dynamic::from(normalized_str_impl(&s, patterns));
}
} else if value.is_map() {
if let Some(mut nested_map) = value.clone().try_cast::<Map>() {
normalized_map_impl(&mut nested_map, patterns);
*value = Dynamic::from(nested_map);
}
} else if value.is_array() {
if let Ok(mut arr) = value.clone().into_array() {
for item in arr.iter_mut() {
if item.is_string() {
if let Ok(s) = item.clone().into_string() {
*item = Dynamic::from(normalized_str_impl(&s, patterns));
}
} else if item.is_map() {
if let Some(mut nested_map) = item.clone().try_cast::<Map>() {
normalized_map_impl(&mut nested_map, patterns);
*item = Dynamic::from(nested_map);
}
}
}
*value = Dynamic::from(arr);
}
}
}
}
/// Takes ownership of `map`, normalizes its values with every pattern named in
/// `DEFAULT_PATTERNS`, and returns the updated map.
fn normalized_map_default(mut map: Map) -> Map {
    let default_names: Vec<String> = DEFAULT_PATTERNS
        .iter()
        .copied()
        .map(String::from)
        .collect();
    normalized_map_impl(&mut map, &default_names);
    map
}
/// Takes ownership of `map`, normalizes its values with the patterns described
/// by `spec`, and returns the updated map.
///
/// `spec` is interpreted by `parse_patterns` (comma-separated string or array
/// of strings).
///
/// # Errors
///
/// Returns an `ErrorRuntime` carrying the `parse_patterns` message, with no
/// source position, when `spec` cannot be interpreted. The map is dropped in
/// that case.
fn normalized_map_with_patterns(
    mut map: Map,
    spec: Dynamic,
) -> Result<Map, Box<rhai::EvalAltResult>> {
    match parse_patterns(spec) {
        Ok(names) => {
            normalized_map_impl(&mut map, &names);
            Ok(map)
        }
        Err(message) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
            message.into(),
            rhai::Position::NONE,
        ))),
    }
}
/// Registers the normalization functions on `engine`.
///
/// Four Rust functions are exposed under the single script-visible name
/// `normalized`: string or map input, each with or without an explicit pattern
/// spec as second argument.
pub fn register_functions(engine: &mut Engine) {
    engine
        .register_fn("normalized", normalized_str_default)
        .register_fn("normalized", normalized_str_with_patterns)
        .register_fn("normalized", normalized_map_default)
        .register_fn("normalized", normalized_map_with_patterns);
}
// Unit tests for the normalization helpers: one group checks individual
// patterns through `normalized_str_impl`, another checks spec parsing and map
// handling, and the last exercises the validators (Luhn, SSN, phone) both
// directly and through normalization.
#[cfg(test)]
mod tests {
use super::*;
// A dotted-quad address is replaced by `<ipv4>`.
#[test]
fn test_normalized_ipv4() {
let result = normalized_str_impl("Server at 192.168.1.100 failed", &["ipv4".to_string()]);
assert_eq!(result, "Server at <ipv4> failed");
}
// An e-mail address is replaced by `<email>`.
#[test]
fn test_normalized_email() {
let result =
normalized_str_impl("Contact user@example.com for help", &["email".to_string()]);
assert_eq!(result, "Contact <email> for help");
}
// The whole URL, including its path, collapses into one `<url>`.
#[test]
fn test_normalized_url() {
let result = normalized_str_impl("Visit https://example.com/path", &["url".to_string()]);
assert_eq!(result, "Visit <url>");
}
// A canonical 8-4-4-4-12 UUID is replaced by `<uuid>`.
#[test]
fn test_normalized_uuid() {
let result = normalized_str_impl(
"Request 550e8400-e29b-41d4-a716-446655440000 processed",
&["uuid".to_string()],
);
assert_eq!(result, "Request <uuid> processed");
}
// Several patterns applied to one string each replace their own matches.
#[test]
fn test_normalized_multiple_patterns() {
let result = normalized_str_impl(
"User user@example.com from 10.0.0.5 accessed https://api.example.com",
&["ipv4".to_string(), "email".to_string(), "url".to_string()],
);
assert_eq!(result, "User <email> from <ipv4> accessed <url>");
}
// The default pattern set covers at least e-mail, IPv4 and UUID.
#[test]
fn test_normalized_default_patterns() {
let result = normalized_str_default(
"User user@example.com from 192.168.1.1 with UUID 550e8400-e29b-41d4-a716-446655440000",
);
assert!(result.contains("<email>"));
assert!(result.contains("<ipv4>"));
assert!(result.contains("<uuid>"));
}
// A comma-separated string spec is split into individual names.
#[test]
fn test_parse_patterns_csv() {
let spec = Dynamic::from("ipv4,email,url");
let patterns = parse_patterns(spec).unwrap();
assert_eq!(patterns, vec!["ipv4", "email", "url"]);
}
// An array spec yields its string elements in order.
#[test]
fn test_parse_patterns_array() {
let arr = vec![Dynamic::from("ipv4"), Dynamic::from("email")];
let spec = Dynamic::from(arr);
let patterns = parse_patterns(spec).unwrap();
assert_eq!(patterns, vec!["ipv4", "email"]);
}
// String values of a flat map are normalized; keys are untouched.
#[test]
fn test_normalized_map_basic() {
let mut map = Map::new();
map.insert("message".into(), Dynamic::from("IP: 192.168.1.1"));
map.insert("email".into(), Dynamic::from("test@example.com"));
let patterns = vec!["ipv4".to_string(), "email".to_string()];
let mut result = map.clone();
normalized_map_impl(&mut result, &patterns);
assert_eq!(
result
.get("message")
.unwrap()
.clone()
.into_string()
.unwrap(),
"IP: <ipv4>"
);
assert_eq!(
result.get("email").unwrap().clone().into_string().unwrap(),
"<email>"
);
}
// Presumably a regression test for the marker-then-placeholder scheme: the
// output must not contain one placeholder spliced into another.
#[test]
fn test_two_pass_no_corruption() {
let result = normalized_str_impl(
"email: user@example.com color: #FF0000",
&["email".to_string(), "hexcolor".to_string()],
);
assert_eq!(result, "email: <email> color: <hexcolor>");
assert!(!result.contains("<hexcol<email>"));
}
// 32- and 64-character hex strings map to `<md5>` and `<sha256>`.
#[test]
fn test_normalized_hash_values() {
let md5 = "5d41402abc4b2a76b9719d911017c592";
let result = normalized_str_impl(&format!("MD5: {}", md5), &["md5".to_string()]);
assert_eq!(result, "MD5: <md5>");
let sha256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";
let result = normalized_str_impl(&format!("SHA256: {}", sha256), &["sha256".to_string()]);
assert_eq!(result, "SHA256: <sha256>");
}
// A `v`-prefixed dotted version is replaced by `<version>`.
#[test]
fn test_normalized_version() {
let result = normalized_str_impl("Version v1.2.3 released", &["version".to_string()]);
assert_eq!(result, "Version <version> released");
}
// Both the colon-separated and the dotted four-digit-group MAC spellings
// are recognized.
#[test]
fn test_normalized_mac_address() {
let result = normalized_str_impl("MAC: 00:1A:2B:3C:4D:5E", &["mac".to_string()]);
assert_eq!(result, "MAC: <mac>");
let result = normalized_str_impl("MAC: 001A.2B3C.4D5E", &["mac".to_string()]);
assert_eq!(result, "MAC: <mac>");
}
// A leading `.` is not part of a `num` match, so it stays in the output.
#[test]
fn test_normalized_num_no_leading_dot() {
let result = normalized_str_impl("value .113 found", &["num".to_string()]);
assert_eq!(result, "value .<num> found");
let result = normalized_str_impl("value 0.113 found", &["num".to_string()]);
assert_eq!(result, "value <num> found");
}
// With only `num`, an IPv4 address is consumed as two decimal numbers;
// with `ipv4`, it is replaced as a single address.
#[test]
fn test_normalized_num_with_ip_address() {
let result = normalized_str_impl("Server at 203.0.113.1 failed", &["num".to_string()]);
assert_eq!(result, "Server at <num>.<num> failed");
let result = normalized_str_impl("Server at 203.0.113.1 failed", &["ipv4".to_string()]);
assert_eq!(result, "Server at <ipv4> failed");
}
// Integers, decimals, signed values and exponent notation are all `num`.
#[test]
fn test_normalized_num_valid_numbers() {
let result = normalized_str_impl("count: 42", &["num".to_string()]);
assert_eq!(result, "count: <num>");
let result = normalized_str_impl("pi: 3.14159", &["num".to_string()]);
assert_eq!(result, "pi: <num>");
let result = normalized_str_impl("temp: -42.5", &["num".to_string()]);
assert_eq!(result, "temp: <num>");
let result = normalized_str_impl("val: 1.23e-10", &["num".to_string()]);
assert_eq!(result, "val: <num>");
}
// First line: numbers that must pass (plain, dashed and spaced spellings).
// Second line: a bad check digit, a too-short number and an all-zero number
// must all fail.
#[test]
fn test_luhn_validation() {
assert!(is_valid_luhn("4111111111111111")); assert!(is_valid_luhn("5500000000000004")); assert!(is_valid_luhn("378282246310005")); assert!(is_valid_luhn("4111-1111-1111-1111")); assert!(is_valid_luhn("4111 1111 1111 1111"));
assert!(!is_valid_luhn("4111111111111112")); assert!(!is_valid_luhn("1234567890")); assert!(!is_valid_luhn("0000000000000000")); }
// Card-shaped digit runs are only replaced when they pass the Luhn check.
#[test]
fn test_normalized_credit_card() {
let result = normalized_str_impl(
"Card: 4111111111111111 charged",
&["credit_card".to_string()],
);
assert_eq!(result, "Card: <credit_card> charged");
let result = normalized_str_impl(
"Card: 4111-1111-1111-1111 charged",
&["credit_card".to_string()],
);
assert_eq!(result, "Card: <credit_card> charged");
let result =
normalized_str_impl("Not a card: 4111111111111112", &["credit_card".to_string()]);
assert_eq!(result, "Not a card: 4111111111111112");
}
// Accepts ordinary numbers and boundary areas (001, 665); rejects area 000,
// 666 and 900+, group 00 and serial 0000.
#[test]
fn test_ssn_validation() {
assert!(is_valid_ssn("123-45-6789"));
assert!(is_valid_ssn("001-01-0001")); assert!(is_valid_ssn("665-01-0001"));
assert!(!is_valid_ssn("000-12-3456"));
assert!(!is_valid_ssn("666-12-3456"));
assert!(!is_valid_ssn("900-12-3456"));
assert!(!is_valid_ssn("999-99-9999"));
assert!(!is_valid_ssn("123-00-6789"));
assert!(!is_valid_ssn("123-45-0000"));
}
// Only dashed, validator-approved SSNs are replaced; undashed digits and
// invalid areas are left as they are.
#[test]
fn test_normalized_ssn() {
let result = normalized_str_impl("SSN: 123-45-6789", &["ssn".to_string()]);
assert_eq!(result, "SSN: <ssn>");
let result = normalized_str_impl("Not SSN: 123456789", &["ssn".to_string()]);
assert_eq!(result, "Not SSN: 123456789");
let result = normalized_str_impl("Bad SSN: 999-99-9999", &["ssn".to_string()]);
assert_eq!(result, "Bad SSN: 999-99-9999");
let result = normalized_str_impl("Bad SSN: 000-12-3456", &["ssn".to_string()]);
assert_eq!(result, "Bad SSN: 000-12-3456");
}
// Area code and exchange must both start with 2-9; the rejected inputs each
// break that rule in one of the two positions.
#[test]
fn test_phone_validation() {
assert!(is_valid_phone("555-234-5678"));
assert!(is_valid_phone("(212) 555-7890"));
assert!(is_valid_phone("+1-415-555-2671"));
assert!(!is_valid_phone("055-234-5678"));
assert!(!is_valid_phone("155-234-5678"));
assert!(!is_valid_phone("555-012-3456"));
assert!(!is_valid_phone("555-123-4567"));
assert!(!is_valid_phone("555-010-1234"));
}
// Phone-shaped text is replaced only when the validator accepts it.
#[test]
fn test_normalized_phone() {
let result = normalized_str_impl("Call (212) 555-7890", &["phone".to_string()]);
assert_eq!(result, "Call <phone>");
let result = normalized_str_impl("Call 555-234-5678", &["phone".to_string()]);
assert_eq!(result, "Call <phone>");
let result = normalized_str_impl("Call +1-415-555-2671", &["phone".to_string()]);
assert_eq!(result, "Call <phone>");
let result = normalized_str_impl("Call 055-234-5678", &["phone".to_string()]);
assert_eq!(result, "Call 055-234-5678");
let result = normalized_str_impl("Call 555-123-4567", &["phone".to_string()]);
assert_eq!(result, "Call 555-123-4567");
}
// The three validated patterns can be combined on one input without
// interfering with each other.
#[test]
fn test_normalized_pii_combined() {
let result = normalized_str_impl(
"User SSN 123-45-6789, card 4111111111111111, phone (212) 555-7890",
&[
"ssn".to_string(),
"credit_card".to_string(),
"phone".to_string(),
],
);
assert_eq!(result, "User SSN <ssn>, card <credit_card>, phone <phone>");
}
}