// cloakrs_patterns/email.rs
use crate::common::{compile_regex, confidence, context_boost};
use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
use once_cell::sync::Lazy;
use regex::Regex;
5
6static EMAIL_REGEX: Lazy<Regex> = Lazy::new(|| {
7 compile_regex(
8 r#"(?:"[^"\r\n]+"|[A-Za-z0-9.!#$%&'*+=?^_`{|}~-]+)@(?:[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?\.)+[A-Za-z]{2,63}"#,
9 )
10});
11
/// Keywords handed to `context_boost` when scoring a match; their presence near an
/// address is used to raise its confidence.
const CONTEXT_WORDS: &[&str] = &["email:", "e-mail:", "mail:", "contact:", "email", "contact"];
13
/// Stateless recognizer that detects e-mail addresses in free text.
///
/// The type carries no data, so it is cheap to copy and can be created with
/// `EmailRecognizer` or `EmailRecognizer::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct EmailRecognizer;
17
18impl Recognizer for EmailRecognizer {
19 fn id(&self) -> &str {
20 "email_regex_v1"
21 }
22
23 fn entity_type(&self) -> EntityType {
24 EntityType::Email
25 }
26
27 fn supported_locales(&self) -> &[Locale] {
28 &[]
29 }
30
31 fn scan(&self, text: &str) -> Vec<PiiEntity> {
32 EMAIL_REGEX
33 .find_iter(text)
34 .filter(|matched| self.is_valid_match(text, matched.start(), matched.end()))
35 .map(|matched| PiiEntity {
36 entity_type: self.entity_type(),
37 span: Span::new(matched.start(), matched.end()),
38 text: matched.as_str().to_string(),
39 confidence: self.compute_confidence(text, matched.start(), matched.as_str()),
40 recognizer_id: self.id().to_string(),
41 })
42 .collect()
43 }
44
45 fn validate(&self, candidate: &str) -> bool {
46 let Some((local, domain)) = candidate.split_once('@') else {
47 return false;
48 };
49 !local.is_empty()
50 && domain.contains('.')
51 && !domain.starts_with('-')
52 && !domain.ends_with('-')
53 && !domain.contains("..")
54 }
55}
56
57impl EmailRecognizer {
58 fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
59 let candidate = &text[start..end];
60 if !self.validate(candidate) {
61 return false;
62 }
63
64 let prefix_start = text[..start]
65 .char_indices()
66 .rev()
67 .find(|(_, c)| c.is_whitespace())
68 .map_or(0, |(idx, c)| idx + c.len_utf8());
69 let prefix = &text[prefix_start..start];
70 if prefix.contains("://") {
71 return false;
72 }
73
74 let before = text[..start].chars().next_back();
75 let after = text[end..].chars().next();
76 !before.is_some_and(|c| c.is_ascii_alphanumeric() || c == '/' || c == '_')
77 && !after.is_some_and(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-')
78 }
79
80 fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
81 let base: f64 = if candidate.starts_with('"') {
82 0.80
83 } else {
84 0.95
85 };
86 confidence(base + context_boost(text, start, CONTEXT_WORDS))
87 }
88}
89
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `input` with [`EmailRecognizer`] and returns only the matched texts,
    /// in the order they were found.
    fn texts(input: &str) -> Vec<String> {
        EmailRecognizer
            .scan(input)
            .into_iter()
            .map(|finding| finding.text)
            .collect()
    }

    #[test]
    fn test_email_standard_detected() {
        assert_eq!(texts("Contact user@example.com"), ["user@example.com"]);
    }

    #[test]
    fn test_email_plus_tag_detected() {
        assert_eq!(texts("user+tag@example.com"), ["user+tag@example.com"]);
    }

    #[test]
    fn test_email_subdomain_detected() {
        assert_eq!(
            texts("Send to user@mail.sub.example.co.uk"),
            ["user@mail.sub.example.co.uk"]
        );
    }

    #[test]
    fn test_email_quoted_local_detected() {
        assert_eq!(
            texts(r#""quoted"@example.com"#),
            [r#""quoted"@example.com"#]
        );
    }

    #[test]
    fn test_email_multiple_detected() {
        assert_eq!(
            texts("a@example.com b@test.org"),
            ["a@example.com", "b@test.org"]
        );
    }

    #[test]
    fn test_email_at_mention_not_detected() {
        assert!(texts("@username").is_empty());
    }

    #[test]
    fn test_email_inside_url_not_detected() {
        assert!(texts("https://user@example.com/path").is_empty());
    }

    #[test]
    fn test_email_without_tld_not_detected() {
        assert!(texts("user@example").is_empty());
    }

    #[test]
    fn test_email_partial_inside_word_not_detected() {
        assert!(texts("prefix/user@example.com").is_empty());
    }

    #[test]
    fn test_email_context_boosts_confidence() {
        let with_context = EmailRecognizer.scan("email: user@example.com");
        let without_context = EmailRecognizer.scan("value user@example.com");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    #[test]
    fn test_email_uppercase_domain_detected() {
        assert_eq!(texts("USER@EXAMPLE.COM"), ["USER@EXAMPLE.COM"]);
    }

    #[test]
    fn test_email_apostrophe_local_detected() {
        assert_eq!(texts("o'hara@example.com"), ["o'hara@example.com"]);
    }

    #[test]
    fn test_email_hyphenated_domain_detected() {
        assert_eq!(
            texts("user@mail-server.example"),
            ["user@mail-server.example"]
        );
    }

    #[test]
    fn test_email_dot_local_detected() {
        assert_eq!(texts("first.last@example.org"), ["first.last@example.org"]);
    }

    #[test]
    fn test_email_trailing_punctuation_excluded() {
        // The sentence-ending period must not be swallowed into the match.
        assert_eq!(texts("Send user@example.com."), ["user@example.com"]);
    }

    #[test]
    fn test_email_double_dot_domain_not_detected() {
        assert!(texts("user@example..com").is_empty());
    }

    #[test]
    fn test_email_domain_starting_with_hyphen_not_detected() {
        assert!(texts("user@-example.com").is_empty());
    }

    #[test]
    fn test_email_e_mail_context_boosts_confidence() {
        let with_context = EmailRecognizer.scan("e-mail: user@example.com");
        let without_context = EmailRecognizer.scan("value user@example.com");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    #[test]
    fn test_email_quoted_local_has_lower_confidence_than_standard() {
        let quoted = EmailRecognizer.scan(r#""quoted"@example.com"#);
        let standard = EmailRecognizer.scan("user@example.com");
        assert!(quoted[0].confidence < standard[0].confidence);
    }

    #[test]
    fn test_email_validate_rejects_missing_at() {
        assert!(!EmailRecognizer.validate("user.example.com"));
    }
}