use crate::patterns::Token;
use regex::Regex;
use std::sync::LazyLock;
/// Source pattern compiled into [`EMAIL_REGEX`].
///
/// Reading left to right, a match consists of:
/// - a word boundary;
/// - a local part: one ASCII alphanumeric followed by up to 63 characters
///   drawn from ASCII alphanumerics, `.`, `_`, `+` and `-`;
/// - a literal `@`;
/// - a domain: one ASCII alphanumeric followed by up to 253 characters drawn
///   from ASCII alphanumerics, `.` and `-`;
/// - a literal `.` followed by at least two ASCII letters;
/// - a closing word boundary.
const EMAIL_PATTERN: &str =
    r"\b[a-zA-Z0-9][a-zA-Z0-9._+-]{0,63}@[a-zA-Z0-9][a-zA-Z0-9.-]{0,253}\.[a-zA-Z]{2,}\b";

/// Email regex, compiled on first access.
///
/// Compilation panics if [`EMAIL_PATTERN`] is rejected by the regex engine.
static EMAIL_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(EMAIL_PATTERN).unwrap());
/// Finds email addresses in free text and swaps each one for an `<EMAIL>`
/// placeholder, reporting the removed addresses as tokens.
#[derive(Debug, Clone)]
pub struct EmailPatternDetector {
    /// Compiled pattern used to locate candidate addresses in the input text.
    regex: Regex,
}
impl EmailPatternDetector {
    /// Text written into the normalized output in place of each accepted address.
    const PLACEHOLDER: &'static str = "<EMAIL>";

    /// Longest candidate, in bytes, that [`Self::validate_email`] will accept.
    // NOTE(review): presumably 64 (local part) + 1 ('@') + 255 (domain) — confirm.
    const MAX_EMAIL_LEN: usize = 320;

    /// Creates a detector that uses a clone of the shared, lazily compiled
    /// email regex.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`; the `Result` return
    /// type is kept so existing callers keep compiling.
    pub fn new() -> Result<Self, regex::Error> {
        Ok(Self {
            regex: EMAIL_REGEX.clone(),
        })
    }

    /// Replaces every valid email address in `text` with `<EMAIL>`.
    ///
    /// Returns the rewritten text together with one [`Token::Email`] per
    /// replaced address, in the order the addresses appear in `text`.
    /// Regex matches that fail [`Self::validate_email`] are left in the
    /// output unchanged and produce no token.
    pub fn detect_and_replace(&self, text: &str) -> (String, Vec<Token>) {
        let mut tokens = Vec::new();
        let mut normalized = String::with_capacity(text.len());
        // Byte offset in `text` up to which the input has already been emitted.
        let mut copied_up_to = 0;

        // Matches arrive left to right and never overlap, so the output can be
        // assembled in one forward pass instead of editing a copy in place.
        for email_match in self.regex.find_iter(text) {
            let email = email_match.as_str();
            if !self.validate_email(email) {
                // Rejected candidate: its text is carried over verbatim by the
                // next copy, because `copied_up_to` is not advanced.
                continue;
            }
            normalized.push_str(&text[copied_up_to..email_match.start()]);
            normalized.push_str(Self::PLACEHOLDER);
            copied_up_to = email_match.end();
            tokens.push(Token::Email(email.to_string()));
        }
        normalized.push_str(&text[copied_up_to..]);

        (normalized, tokens)
    }

    /// Applies structural sanity checks to a candidate address.
    ///
    /// Returns `true` only when all of the following hold:
    /// - `candidate` is at most 320 bytes long;
    /// - it contains exactly one `@`;
    /// - both the local part (before `@`) and the domain (after `@`) are
    ///   non-empty;
    /// - the domain contains at least one `.`;
    /// - neither the local part nor the domain starts or ends with `.`.
    pub fn validate_email(&self, candidate: &str) -> bool {
        if candidate.len() > Self::MAX_EMAIL_LEN {
            return false;
        }

        // `split_once` fails when there is no '@'; a second '@' would end up
        // in `domain`, so together these checks enforce "exactly one '@'".
        let Some((local, domain)) = candidate.split_once('@') else {
            return false;
        };
        if domain.contains('@') {
            return false;
        }

        let no_edge_dot = |part: &str| !part.starts_with('.') && !part.ends_with('.');

        !local.is_empty()
            && !domain.is_empty()
            && domain.contains('.')
            && no_edge_dot(domain)
            && no_edge_dot(local)
    }
}
impl Default for EmailPatternDetector {
fn default() -> Self {
Self::new().unwrap()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Extracts the address carried by an `Email` token, failing the test for
    // any other token kind.
    fn email_of(token: &Token) -> &str {
        match token {
            Token::Email(address) => address.as_str(),
            _ => panic!("Expected Email token"),
        }
    }

    // Builds `aaa…@bbb….com` with a local part of `local_len` bytes and a
    // total length of exactly `total_len` bytes.
    fn address_with_len(local_len: usize, total_len: usize) -> String {
        let suffix = ".com";
        // Total = local part + '@' + domain body + suffix.
        let domain_body_len = total_len - local_len - 1 - suffix.len();
        format!(
            "{}@{}{suffix}",
            "a".repeat(local_len),
            "b".repeat(domain_body_len)
        )
    }

    #[test]
    fn test_email_detector_creation() {
        let detector = EmailPatternDetector::new();
        assert!(detector.is_ok(), "Email detector creation should succeed");
    }

    #[test]
    fn test_basic_email_detection() {
        let detector = EmailPatternDetector::new().unwrap();
        let (normalized, tokens) = detector.detect_and_replace("User test@example.com logged in");
        assert_eq!(normalized, "User <EMAIL> logged in");
        assert_eq!(tokens.len(), 1);
        assert_eq!(email_of(&tokens[0]), "test@example.com");
    }

    #[test]
    fn test_multiple_emails_detection() {
        let detector = EmailPatternDetector::new().unwrap();
        let (normalized, tokens) =
            detector.detect_and_replace("Forward from alice@company.com to bob@company.com");
        assert_eq!(normalized, "Forward from <EMAIL> to <EMAIL>");
        assert_eq!(tokens.len(), 2);
        // Tokens must come back in the order the addresses appear in the text.
        assert_eq!(email_of(&tokens[0]), "alice@company.com");
        assert_eq!(email_of(&tokens[1]), "bob@company.com");
    }

    #[test]
    fn test_no_false_positives() {
        let detector = EmailPatternDetector::new().unwrap();
        let test_cases = [
            "@domain.com",       // no local part
            "user@",             // no domain
            "user@.com",         // domain starts with a dot
            "not-an-email",      // no '@' at all
            "user@domain@extra", // more than one '@'
            "user@domain",       // domain without a dot
        ];
        for case in test_cases {
            let (normalized, tokens) = detector.detect_and_replace(case);
            assert_eq!(normalized, case, "Should not modify invalid email: {case}");
            assert_eq!(
                tokens.len(),
                0,
                "Should not detect tokens for invalid email: {case}"
            );
        }
    }

    #[test]
    fn test_email_validation() {
        let detector = EmailPatternDetector::new().unwrap();

        // Accepted shapes.
        assert!(detector.validate_email("user@domain.com"));
        assert!(detector.validate_email("first.last@subdomain.example.org"));
        assert!(detector.validate_email("admin+tag@company-name.co.uk"));

        // Rejected shapes.
        assert!(!detector.validate_email("")); // empty input
        assert!(!detector.validate_email(&"a".repeat(321))); // over the length limit
        assert!(!detector.validate_email("user@domain@extra")); // more than one '@'
        assert!(!detector.validate_email("@domain.com")); // empty local part
        assert!(!detector.validate_email("user@")); // empty domain
        assert!(!detector.validate_email("user@.com")); // domain starts with a dot
        assert!(!detector.validate_email("user@domain.")); // domain ends with a dot
        assert!(!detector.validate_email(".user@domain.com")); // local part starts with a dot
        assert!(!detector.validate_email("user.@domain.com")); // local part ends with a dot
        assert!(!detector.validate_email("user@domain")); // domain without a dot
    }

    #[test]
    fn test_complex_email_formats() {
        let detector = EmailPatternDetector::new().unwrap();
        let test_cases = [
            "user@example.com",
            "first.last@domain.co.uk",
            "admin+tag@company.org",
            "test_user@sub.domain.com",
            "user123@domain123.net",
            "user-name@domain-name.info",
        ];
        for email in test_cases {
            let (normalized, tokens) = detector.detect_and_replace(&format!("Email: {email}"));
            assert_eq!(normalized, "Email: <EMAIL>", "Should replace: {email}");
            assert_eq!(tokens.len(), 1, "Should detect one token for: {email}");
            assert_eq!(email_of(&tokens[0]), email);
        }
    }

    #[test]
    fn validate_email_exactly_320_chars() {
        // 64-byte local part, the counterpart of `validate_email_321_chars_rejected`.
        let email = address_with_len(64, 320);
        assert_eq!(email.len(), 320, "email len should be exactly 320");
        let detector = EmailPatternDetector::new().unwrap();
        assert!(detector.validate_email(&email));
    }

    #[test]
    fn validate_email_321_chars_rejected() {
        let email = address_with_len(64, 321);
        assert!(
            email.len() > 320,
            "email len {} should exceed 320",
            email.len()
        );
        let detector = EmailPatternDetector::new().unwrap();
        assert!(!detector.validate_email(&email));
    }

    #[test]
    fn validate_email_exactly_320_accepted() {
        let email = address_with_len(50, 320);
        assert_eq!(email.len(), 320, "email len should be exactly 320");
        let detector = EmailPatternDetector::new().unwrap();
        assert!(
            detector.validate_email(&email),
            "320-char email should be accepted"
        );
    }

    #[test]
    fn validate_email_exactly_321_rejected() {
        let email = address_with_len(50, 321);
        assert_eq!(email.len(), 321, "email len should be exactly 321");
        let detector = EmailPatternDetector::new().unwrap();
        assert!(
            !detector.validate_email(&email),
            "321-char email should be rejected"
        );
    }
}