cloakrs_patterns/
phone.rs1use crate::common::{compile_regex, confidence, context_boost, digits, is_boundary};
2use crate::credit_card::luhn_valid;
3use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
4use once_cell::sync::Lazy;
5use regex::Regex;
6use std::collections::HashSet;
7
8static INTERNATIONAL_PHONE_REGEX: Lazy<Regex> = Lazy::new(|| {
9 compile_regex(r"\+\d{1,3}[\s.-]?(?:\(\d{2,4}\)|\d{1,4})(?:[\s.-]?\d{2,6}){2,4}\b")
10});
11static NANP_PHONE_REGEX: Lazy<Regex> =
12 Lazy::new(|| compile_regex(r"(?:\(\d{3}\)\s*|\b\d{3}[-. ])\d{3}[-. ]\d{4}\b"));
13
/// Words handed to `context_boost` when scoring a candidate; see
/// `PhoneRecognizer::compute_confidence`.
const CONTEXT_WORDS: &[&str] = &[
    "call",
    "phone",
    "tel:",
    "tel",
    "mobile",
    "cell",
    "fax",
    "dial",
    "text",
    "sms",
];
17
/// Stateless recognizer for phone numbers.
///
/// Carries no fields; all behavior lives in its `Recognizer` implementation
/// and private helpers.
#[derive(Clone, Copy, Debug, Default)]
pub struct PhoneRecognizer;
21
22impl Recognizer for PhoneRecognizer {
23 fn id(&self) -> &str {
24 "phone_regex_v1"
25 }
26
27 fn entity_type(&self) -> EntityType {
28 EntityType::PhoneNumber
29 }
30
31 fn supported_locales(&self) -> &[Locale] {
32 &[]
33 }
34
35 fn scan(&self, text: &str) -> Vec<PiiEntity> {
36 let mut seen = HashSet::new();
37 let mut findings = Vec::new();
38
39 for regex in [&*INTERNATIONAL_PHONE_REGEX, &*NANP_PHONE_REGEX] {
40 for matched in regex.find_iter(text) {
41 if !findings.iter().any(|finding: &PiiEntity| {
42 matched.start() >= finding.span.start && matched.end() <= finding.span.end
43 }) && seen.insert((matched.start(), matched.end()))
44 && self.is_valid_match(text, matched.start(), matched.end())
45 {
46 findings.push(PiiEntity {
47 entity_type: self.entity_type(),
48 span: Span::new(matched.start(), matched.end()),
49 text: matched.as_str().to_string(),
50 confidence: self.compute_confidence(
51 text,
52 matched.start(),
53 matched.as_str(),
54 ),
55 recognizer_id: self.id().to_string(),
56 });
57 }
58 }
59 }
60
61 findings.sort_by_key(|finding| finding.span.start);
62 findings
63 }
64
65 fn validate(&self, candidate: &str) -> bool {
66 let digits = digits(candidate);
67 if !(7..=15).contains(&digits.len()) {
68 return false;
69 }
70 if digits.chars().all(|c| c == digits.as_bytes()[0] as char) {
71 return false;
72 }
73 if (13..=15).contains(&digits.len()) && luhn_valid(&digits) {
74 return false;
75 }
76 true
77 }
78}
79
80impl PhoneRecognizer {
81 fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
82 self.validate(&text[start..end]) && is_boundary(text, start, end)
83 }
84
85 fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
86 let base = if candidate.trim_start().starts_with('+') {
87 0.90
88 } else {
89 0.70
90 };
91 confidence(base + context_boost(text, start, CONTEXT_WORDS))
92 }
93}
94
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `input` and returns just the matched text of each finding, in
    /// the order `scan` reports them.
    fn texts(input: &str) -> Vec<String> {
        PhoneRecognizer
            .scan(input)
            .into_iter()
            .map(|finding| finding.text)
            .collect()
    }

    #[test]
    fn test_phone_us_international_detected() {
        assert_eq!(texts("call +1 (555) 123-4567"), ["+1 (555) 123-4567"]);
    }

    #[test]
    fn test_phone_netherlands_mobile_detected() {
        assert_eq!(texts("+31 6 12345678"), ["+31 6 12345678"]);
    }

    #[test]
    fn test_phone_uk_mobile_detected() {
        assert_eq!(texts("+44 7911 123456"), ["+44 7911 123456"]);
    }

    #[test]
    fn test_phone_nanp_dashes_detected() {
        assert_eq!(texts("555-123-4567"), ["555-123-4567"]);
    }

    #[test]
    fn test_phone_nanp_parentheses_detected() {
        assert_eq!(texts("(555) 123-4567"), ["(555) 123-4567"]);
    }

    #[test]
    fn test_phone_year_not_detected() {
        assert!(texts("2024").is_empty());
    }

    #[test]
    fn test_phone_zip_not_detected() {
        assert!(texts("90210").is_empty());
    }

    #[test]
    fn test_phone_credit_card_not_detected() {
        assert!(texts("4111 1111 1111 1111").is_empty());
    }

    #[test]
    fn test_phone_short_sequence_rejected() {
        assert!(!PhoneRecognizer.validate("123-456"));
    }

    #[test]
    fn test_phone_context_boosts_confidence() {
        let with_context = PhoneRecognizer.scan("phone: 555-123-4567");
        let without_context = PhoneRecognizer.scan("value 555-123-4567");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    #[test]
    fn test_phone_nanp_dots_detected() {
        assert_eq!(texts("555.123.4567"), ["555.123.4567"]);
    }

    #[test]
    fn test_phone_international_dots_detected() {
        assert_eq!(texts("+1.555.123.4567"), ["+1.555.123.4567"]);
    }

    #[test]
    fn test_phone_french_mobile_detected() {
        assert_eq!(texts("+33 6 12 34 56 78"), ["+33 6 12 34 56 78"]);
    }

    #[test]
    fn test_phone_german_number_detected() {
        assert_eq!(texts("+49 30 1234 5678"), ["+49 30 1234 5678"]);
    }

    #[test]
    fn test_phone_two_numbers_detected() {
        assert_eq!(
            texts("call 555-123-4567 or +44 7911 123456"),
            ["555-123-4567", "+44 7911 123456"]
        );
    }

    // A bare 7-digit local number matches neither pattern, so nothing is
    // reported even though `validate` accepts 7 digits on its own.
    #[test]
    fn test_phone_seven_digit_local_not_detected() {
        assert!(texts("555-1212").is_empty());
    }

    #[test]
    fn test_phone_all_same_digits_rejected() {
        assert!(texts("111-111-1111").is_empty());
    }

    #[test]
    fn test_phone_long_sequence_rejected() {
        assert!(!PhoneRecognizer.validate("+123 4567 8901 2345 6789"));
    }

    #[test]
    fn test_phone_embedded_in_word_not_detected() {
        assert!(texts("id555-123-4567").is_empty());
    }

    #[test]
    fn test_phone_trailing_letter_not_detected() {
        assert!(texts("555-123-4567x").is_empty());
    }

    #[test]
    fn test_phone_international_confidence_higher_than_nanp() {
        let international = PhoneRecognizer.scan("+1 555 123 4567");
        let nanp = PhoneRecognizer.scan("555-123-4567");
        assert!(international[0].confidence > nanp[0].confidence);
    }

    #[test]
    fn test_phone_tel_context_boosts_confidence() {
        let with_context = PhoneRecognizer.scan("tel: 555-123-4567");
        let without_context = PhoneRecognizer.scan("value 555-123-4567");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    #[test]
    fn test_phone_mobile_context_boosts_confidence() {
        let with_context = PhoneRecognizer.scan("mobile +31 6 12345678");
        let without_context = PhoneRecognizer.scan("value +31 6 12345678");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    #[test]
    fn test_phone_plain_random_digits_not_detected() {
        assert!(texts("1234567890").is_empty());
    }

    #[test]
    fn test_phone_date_not_detected() {
        assert!(texts("2026-05-08").is_empty());
    }

    #[test]
    fn test_phone_validate_accepts_minimum_digit_count() {
        assert!(PhoneRecognizer.validate("123-4567"));
    }

    #[test]
    fn test_phone_validate_rejects_six_digits() {
        assert!(!PhoneRecognizer.validate("123456"));
    }

    #[test]
    fn test_phone_validate_rejects_sixteen_digits() {
        assert!(!PhoneRecognizer.validate("1234567890123456"));
    }

    // Only the upper bound is checked here: stacking several context words on
    // top of the 0.90 base must never push the score past 1.0.
    #[test]
    fn test_phone_context_can_reach_full_confidence_cap() {
        let finding = PhoneRecognizer.scan("call mobile phone +31 6 12345678");
        assert!(finding[0].confidence.value() <= 1.0);
    }

    #[test]
    fn test_phone_fax_context_boosts_confidence() {
        let with_context = PhoneRecognizer.scan("fax 555-123-4567");
        let without_context = PhoneRecognizer.scan("value 555-123-4567");
        assert!(with_context[0].confidence > without_context[0].confidence);
    }
}