Skip to main content

cloakrs_patterns/
email.rs

1use crate::common::{compile_regex, confidence, context_boost};
2use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
3use once_cell::sync::Lazy;
4use regex::Regex;
5
6static EMAIL_REGEX: Lazy<Regex> = Lazy::new(|| {
7    compile_regex(
8        r#"(?:"[^"\r\n]+"|[A-Za-z0-9.!#$%&'*+=?^_`{|}~-]+)@(?:[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?\.)+[A-Za-z]{2,63}"#,
9    )
10});
11
/// Cue words handed to `context_boost` when a match is scored.
const CONTEXT_WORDS: &[&str] = &[
    "email:",
    "e-mail:",
    "mail:",
    "contact:",
    "email",
    "contact",
];
13
/// Recognizer for common email addresses.
///
/// The type is a field-less unit struct, so it can be copied freely and created
/// with `EmailRecognizer` or `EmailRecognizer::default()`.
#[derive(Clone, Copy, Debug, Default)]
pub struct EmailRecognizer;
17
18impl Recognizer for EmailRecognizer {
19    fn id(&self) -> &str {
20        "email_regex_v1"
21    }
22
23    fn entity_type(&self) -> EntityType {
24        EntityType::Email
25    }
26
27    fn supported_locales(&self) -> &[Locale] {
28        &[]
29    }
30
31    fn scan(&self, text: &str) -> Vec<PiiEntity> {
32        EMAIL_REGEX
33            .find_iter(text)
34            .filter(|matched| self.is_valid_match(text, matched.start(), matched.end()))
35            .map(|matched| PiiEntity {
36                entity_type: self.entity_type(),
37                span: Span::new(matched.start(), matched.end()),
38                text: matched.as_str().to_string(),
39                confidence: self.compute_confidence(text, matched.start(), matched.as_str()),
40                recognizer_id: self.id().to_string(),
41            })
42            .collect()
43    }
44
45    fn validate(&self, candidate: &str) -> bool {
46        let Some((local, domain)) = candidate.split_once('@') else {
47            return false;
48        };
49        !local.is_empty()
50            && domain.contains('.')
51            && !domain.starts_with('-')
52            && !domain.ends_with('-')
53            && !domain.contains("..")
54    }
55}
56
57impl EmailRecognizer {
58    fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
59        let candidate = &text[start..end];
60        if !self.validate(candidate) {
61            return false;
62        }
63
64        let prefix_start = text[..start]
65            .char_indices()
66            .rev()
67            .find(|(_, c)| c.is_whitespace())
68            .map_or(0, |(idx, c)| idx + c.len_utf8());
69        let prefix = &text[prefix_start..start];
70        if prefix.contains("://") {
71            return false;
72        }
73
74        let before = text[..start].chars().next_back();
75        let after = text[end..].chars().next();
76        !before.is_some_and(|c| c.is_ascii_alphanumeric() || c == '/' || c == '_')
77            && !after.is_some_and(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-')
78    }
79
80    fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
81        let base: f64 = if candidate.starts_with('"') {
82            0.80
83        } else {
84            0.95
85        };
86        confidence(base + context_boost(text, start, CONTEXT_WORDS))
87    }
88}
89
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `input` and returns only the matched substrings, in scan order.
    fn detected(input: &str) -> Vec<String> {
        let mut found = Vec::new();
        for finding in EmailRecognizer.scan(input) {
            found.push(finding.text);
        }
        found
    }

    /// Asserts that scanning `input` yields exactly `expected`.
    #[track_caller]
    fn assert_detects(input: &str, expected: &[&str]) {
        assert_eq!(detected(input), expected);
    }

    /// Asserts that scanning `input` yields nothing.
    #[track_caller]
    fn assert_ignored(input: &str) {
        assert!(detected(input).is_empty());
    }

    /// Asserts that the first finding in `labelled` outscores the first finding
    /// in the unlabelled baseline sentence.
    #[track_caller]
    fn assert_context_raises_confidence(labelled: &str) {
        let with_context = EmailRecognizer.scan(labelled);
        let without_context = EmailRecognizer.scan("value user@example.com");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    #[test]
    fn test_email_standard_detected() {
        assert_detects("Contact user@example.com", &["user@example.com"]);
    }

    #[test]
    fn test_email_plus_tag_detected() {
        assert_detects("user+tag@example.com", &["user+tag@example.com"]);
    }

    #[test]
    fn test_email_subdomain_detected() {
        assert_detects(
            "Send to user@mail.sub.example.co.uk",
            &["user@mail.sub.example.co.uk"],
        );
    }

    #[test]
    fn test_email_quoted_local_detected() {
        assert_detects(r#""quoted"@example.com"#, &[r#""quoted"@example.com"#]);
    }

    #[test]
    fn test_email_multiple_detected() {
        assert_detects(
            "a@example.com b@test.org",
            &["a@example.com", "b@test.org"],
        );
    }

    #[test]
    fn test_email_at_mention_not_detected() {
        assert_ignored("@username");
    }

    #[test]
    fn test_email_inside_url_not_detected() {
        assert_ignored("https://user@example.com/path");
    }

    #[test]
    fn test_email_without_tld_not_detected() {
        assert_ignored("user@example");
    }

    #[test]
    fn test_email_partial_inside_word_not_detected() {
        assert_ignored("prefix/user@example.com");
    }

    #[test]
    fn test_email_context_boosts_confidence() {
        assert_context_raises_confidence("email: user@example.com");
    }

    #[test]
    fn test_email_uppercase_domain_detected() {
        assert_detects("USER@EXAMPLE.COM", &["USER@EXAMPLE.COM"]);
    }

    #[test]
    fn test_email_apostrophe_local_detected() {
        assert_detects("o'hara@example.com", &["o'hara@example.com"]);
    }

    #[test]
    fn test_email_hyphenated_domain_detected() {
        assert_detects("user@mail-server.example", &["user@mail-server.example"]);
    }

    #[test]
    fn test_email_dot_local_detected() {
        assert_detects("first.last@example.org", &["first.last@example.org"]);
    }

    #[test]
    fn test_email_trailing_punctuation_excluded() {
        // The sentence-ending period must not become part of the match.
        assert_detects("Send user@example.com.", &["user@example.com"]);
    }

    #[test]
    fn test_email_double_dot_domain_not_detected() {
        assert_ignored("user@example..com");
    }

    #[test]
    fn test_email_domain_starting_with_hyphen_not_detected() {
        assert_ignored("user@-example.com");
    }

    #[test]
    fn test_email_e_mail_context_boosts_confidence() {
        assert_context_raises_confidence("e-mail: user@example.com");
    }

    #[test]
    fn test_email_quoted_local_has_lower_confidence_than_standard() {
        let quoted = EmailRecognizer.scan(r#""quoted"@example.com"#);
        let standard = EmailRecognizer.scan("user@example.com");
        assert!(quoted[0].confidence < standard[0].confidence);
    }

    #[test]
    fn test_email_validate_rejects_missing_at() {
        assert!(!EmailRecognizer.validate("user.example.com"));
    }
}