Skip to main content

cloakrs_patterns/
phone.rs

1use crate::common::{compile_regex, confidence, context_boost, digits, is_boundary};
2use crate::credit_card::luhn_valid;
3use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
4use once_cell::sync::Lazy;
5use regex::Regex;
6use std::collections::HashSet;
7
8static INTERNATIONAL_PHONE_REGEX: Lazy<Regex> = Lazy::new(|| {
9    compile_regex(r"\+\d{1,3}[\s.-]?(?:\(\d{2,4}\)|\d{1,4})(?:[\s.-]?\d{2,6}){2,4}\b")
10});
11static NANP_PHONE_REGEX: Lazy<Regex> =
12    Lazy::new(|| compile_regex(r"(?:\(\d{3}\)\s*|\b\d{3}[-. ])\d{3}[-. ]\d{4}\b"));
13
/// Words handed to `context_boost` when scoring a candidate.
///
/// NOTE(review): how `context_boost` searches for these (window size, case
/// sensitivity, whether order matters) is defined in `crate::common` and is
/// not visible here, so the original order is preserved as-is.
const CONTEXT_WORDS: &[&str] = &[
    "call", "phone", "tel:", "tel", "mobile", "cell", "fax", "dial", "text", "sms",
];
17
/// Recognizes common international and North American phone numbers.
///
/// This is a field-less unit struct, so it carries no configuration or state;
/// detection is driven by the two module-level regexes and the
/// [`Recognizer`] implementation below.
#[derive(Debug, Clone, Copy, Default)]
pub struct PhoneRecognizer;
21
22impl Recognizer for PhoneRecognizer {
23    fn id(&self) -> &str {
24        "phone_regex_v1"
25    }
26
27    fn entity_type(&self) -> EntityType {
28        EntityType::PhoneNumber
29    }
30
31    fn supported_locales(&self) -> &[Locale] {
32        &[]
33    }
34
35    fn scan(&self, text: &str) -> Vec<PiiEntity> {
36        let mut seen = HashSet::new();
37        let mut findings = Vec::new();
38
39        for regex in [&*INTERNATIONAL_PHONE_REGEX, &*NANP_PHONE_REGEX] {
40            for matched in regex.find_iter(text) {
41                if !findings.iter().any(|finding: &PiiEntity| {
42                    matched.start() >= finding.span.start && matched.end() <= finding.span.end
43                }) && seen.insert((matched.start(), matched.end()))
44                    && self.is_valid_match(text, matched.start(), matched.end())
45                {
46                    findings.push(PiiEntity {
47                        entity_type: self.entity_type(),
48                        span: Span::new(matched.start(), matched.end()),
49                        text: matched.as_str().to_string(),
50                        confidence: self.compute_confidence(
51                            text,
52                            matched.start(),
53                            matched.as_str(),
54                        ),
55                        recognizer_id: self.id().to_string(),
56                    });
57                }
58            }
59        }
60
61        findings.sort_by_key(|finding| finding.span.start);
62        findings
63    }
64
65    fn validate(&self, candidate: &str) -> bool {
66        let digits = digits(candidate);
67        if !(7..=15).contains(&digits.len()) {
68            return false;
69        }
70        if digits.chars().all(|c| c == digits.as_bytes()[0] as char) {
71            return false;
72        }
73        if (13..=15).contains(&digits.len()) && luhn_valid(&digits) {
74            return false;
75        }
76        true
77    }
78}
79
80impl PhoneRecognizer {
81    fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
82        self.validate(&text[start..end]) && is_boundary(text, start, end)
83    }
84
85    fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
86        let base = if candidate.trim_start().starts_with('+') {
87            0.90
88        } else {
89            0.70
90        };
91        confidence(base + context_boost(text, start, CONTEXT_WORDS))
92    }
93}
94
/// Unit tests for [`PhoneRecognizer`]: detection of common formats, rejection
/// of look-alikes (years, ZIP codes, dates, card numbers), `validate` digit
/// bounds, and relative confidence with and without context words.
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a scan and returns only the matched substrings, in result order.
    fn texts(input: &str) -> Vec<String> {
        PhoneRecognizer
            .scan(input)
            .into_iter()
            .map(|finding| finding.text)
            .collect()
    }

    /// Asserts that the first finding in `with_context` scores strictly
    /// higher than the first finding in `without_context`.
    ///
    /// `#[track_caller]` makes a failure point at the calling test rather
    /// than at this helper.
    #[track_caller]
    fn assert_context_boosts(with_context: &str, without_context: &str) {
        let boosted = PhoneRecognizer.scan(with_context);
        let plain = PhoneRecognizer.scan(without_context);
        assert!(boosted[0].confidence > plain[0].confidence);
    }

    // --- Formats that must be detected -----------------------------------

    #[test]
    fn test_phone_us_international_detected() {
        assert_eq!(texts("call +1 (555) 123-4567"), ["+1 (555) 123-4567"]);
    }

    #[test]
    fn test_phone_netherlands_mobile_detected() {
        assert_eq!(texts("+31 6 12345678"), ["+31 6 12345678"]);
    }

    #[test]
    fn test_phone_uk_mobile_detected() {
        assert_eq!(texts("+44 7911 123456"), ["+44 7911 123456"]);
    }

    #[test]
    fn test_phone_nanp_dashes_detected() {
        assert_eq!(texts("555-123-4567"), ["555-123-4567"]);
    }

    #[test]
    fn test_phone_nanp_parentheses_detected() {
        assert_eq!(texts("(555) 123-4567"), ["(555) 123-4567"]);
    }

    #[test]
    fn test_phone_nanp_dots_detected() {
        assert_eq!(texts("555.123.4567"), ["555.123.4567"]);
    }

    #[test]
    fn test_phone_international_dots_detected() {
        assert_eq!(texts("+1.555.123.4567"), ["+1.555.123.4567"]);
    }

    #[test]
    fn test_phone_french_mobile_detected() {
        assert_eq!(texts("+33 6 12 34 56 78"), ["+33 6 12 34 56 78"]);
    }

    #[test]
    fn test_phone_german_number_detected() {
        assert_eq!(texts("+49 30 1234 5678"), ["+49 30 1234 5678"]);
    }

    #[test]
    fn test_phone_two_numbers_detected() {
        assert_eq!(
            texts("call 555-123-4567 or +44 7911 123456"),
            ["555-123-4567", "+44 7911 123456"]
        );
    }

    // --- Inputs that must not be detected --------------------------------

    #[test]
    fn test_phone_year_not_detected() {
        assert!(texts("2024").is_empty());
    }

    #[test]
    fn test_phone_zip_not_detected() {
        assert!(texts("90210").is_empty());
    }

    #[test]
    fn test_phone_credit_card_not_detected() {
        assert!(texts("4111 1111 1111 1111").is_empty());
    }

    // Renamed from `..._detected`: the assertion has always required that a
    // bare seven-digit local number is NOT reported.
    #[test]
    fn test_phone_seven_digit_local_not_detected() {
        assert!(texts("555-1212").is_empty());
    }

    #[test]
    fn test_phone_all_same_digits_rejected() {
        assert!(texts("111-111-1111").is_empty());
    }

    #[test]
    fn test_phone_embedded_in_word_not_detected() {
        assert!(texts("id555-123-4567").is_empty());
    }

    #[test]
    fn test_phone_trailing_letter_not_detected() {
        assert!(texts("555-123-4567x").is_empty());
    }

    #[test]
    fn test_phone_plain_random_digits_not_detected() {
        assert!(texts("1234567890").is_empty());
    }

    #[test]
    fn test_phone_date_not_detected() {
        assert!(texts("2026-05-08").is_empty());
    }

    // --- `validate` digit-count bounds -----------------------------------

    #[test]
    fn test_phone_short_sequence_rejected() {
        assert!(!PhoneRecognizer.validate("123-456"));
    }

    #[test]
    fn test_phone_long_sequence_rejected() {
        assert!(!PhoneRecognizer.validate("+123 4567 8901 2345 6789"));
    }

    #[test]
    fn test_phone_validate_accepts_minimum_digit_count() {
        assert!(PhoneRecognizer.validate("123-4567"));
    }

    #[test]
    fn test_phone_validate_rejects_six_digits() {
        assert!(!PhoneRecognizer.validate("123456"));
    }

    #[test]
    fn test_phone_validate_rejects_sixteen_digits() {
        assert!(!PhoneRecognizer.validate("1234567890123456"));
    }

    // --- Confidence ------------------------------------------------------

    #[test]
    fn test_phone_international_confidence_higher_than_nanp() {
        let international = PhoneRecognizer.scan("+1 555 123 4567");
        let nanp = PhoneRecognizer.scan("555-123-4567");
        assert!(international[0].confidence > nanp[0].confidence);
    }

    #[test]
    fn test_phone_context_boosts_confidence() {
        assert_context_boosts("phone: 555-123-4567", "value 555-123-4567");
    }

    #[test]
    fn test_phone_tel_context_boosts_confidence() {
        assert_context_boosts("tel: 555-123-4567", "value 555-123-4567");
    }

    #[test]
    fn test_phone_mobile_context_boosts_confidence() {
        assert_context_boosts("mobile +31 6 12345678", "value +31 6 12345678");
    }

    #[test]
    fn test_phone_fax_context_boosts_confidence() {
        assert_context_boosts("fax 555-123-4567", "value 555-123-4567");
    }

    // Renamed from `..._can_reach_full_confidence_cap`: the assertion only
    // checks the upper bound, not that the cap is actually reached.
    #[test]
    fn test_phone_stacked_context_confidence_within_cap() {
        let finding = PhoneRecognizer.scan("call mobile phone +31 6 12345678");
        assert!(finding[0].confidence.value() <= 1.0);
    }
}