1use crate::common::{compile_regex, confidence, context_boost, digits, is_boundary};
2use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
3use once_cell::sync::Lazy;
4use regex::Regex;
5
6static SSN_REGEX: Lazy<Regex> =
7 Lazy::new(|| compile_regex(r"\b(?:\d{3}[- ]\d{2}[- ]\d{4}|\d{9})\b"));
8
/// Locale list handed out by `SsnRecognizer::supported_locales`.
static US_LOCALES: &[Locale] = &[Locale::US];

/// Phrases passed to `context_boost` when scoring a match.
// NOTE(review): how `context_boost` searches for these (window size, case
// sensitivity) is defined in `crate::common` — confirm there.
const CONTEXT_WORDS: &[&str] = &["ssn", "social security", "tax id", "taxpayer"];
12
/// Regex-based recognizer for US Social Security numbers.
///
/// This is a stateless unit struct; all behavior lives in its `Recognizer`
/// implementation and its private helper methods.
#[derive(Debug, Clone, Copy, Default)]
pub struct SsnRecognizer;
16
17impl Recognizer for SsnRecognizer {
18 fn id(&self) -> &str {
19 "us_ssn_regex_v1"
20 }
21
22 fn entity_type(&self) -> EntityType {
23 EntityType::Ssn
24 }
25
26 fn supported_locales(&self) -> &[Locale] {
27 US_LOCALES
28 }
29
30 fn scan(&self, text: &str) -> Vec<PiiEntity> {
31 SSN_REGEX
32 .find_iter(text)
33 .filter(|matched| self.is_valid_match(text, matched.start(), matched.end()))
34 .map(|matched| PiiEntity {
35 entity_type: self.entity_type(),
36 span: Span::new(matched.start(), matched.end()),
37 text: matched.as_str().to_string(),
38 confidence: self.compute_confidence(text, matched.start(), matched.as_str()),
39 recognizer_id: self.id().to_string(),
40 })
41 .collect()
42 }
43
44 fn validate(&self, candidate: &str) -> bool {
45 let digits = digits(candidate);
46 if digits.len() != 9 {
47 return false;
48 }
49
50 let area = &digits[0..3];
51 let group = &digits[3..5];
52 let serial = &digits[5..9];
53
54 area != "000"
55 && area != "666"
56 && !matches!(area.parse::<u16>(), Ok(900..=999))
57 && group != "00"
58 && serial != "0000"
59 }
60}
61
62impl SsnRecognizer {
63 fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
64 self.validate(&text[start..end]) && is_boundary(text, start, end)
65 }
66
67 fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
68 let base = if candidate.contains('-') || candidate.contains(' ') {
69 0.85
70 } else {
71 0.50
72 };
73 confidence(base + context_boost(text, start, CONTEXT_WORDS))
74 }
75}
76
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `input` and returns the matched substrings in scan order.
    fn detected(input: &str) -> Vec<String> {
        let mut found = Vec::new();
        for finding in SsnRecognizer.scan(input) {
            found.push(finding.text);
        }
        found
    }

    /// Scans `input` and returns the confidence of the first finding.
    ///
    /// Panics when the scan produces no findings.
    fn first_confidence(input: &str) -> Confidence {
        SsnRecognizer
            .scan(input)
            .into_iter()
            .next()
            .map(|finding| finding.confidence)
            .expect("expected at least one finding")
    }

    // --- Accepted formats ---------------------------------------------------

    #[test]
    fn test_ssn_dash_format_detected() {
        assert_eq!(detected("SSN 123-45-6789"), vec!["123-45-6789"]);
    }

    #[test]
    fn test_ssn_space_format_detected() {
        assert_eq!(detected("123 45 6789"), vec!["123 45 6789"]);
    }

    #[test]
    fn test_ssn_plain_format_detected() {
        assert_eq!(detected("123456789"), vec!["123456789"]);
    }

    // --- Structurally invalid numbers ---------------------------------------

    #[test]
    fn test_ssn_area_000_rejected() {
        assert_eq!(detected("000-45-6789"), Vec::<String>::new());
    }

    #[test]
    fn test_ssn_area_666_rejected() {
        assert_eq!(detected("666-45-6789"), Vec::<String>::new());
    }

    #[test]
    fn test_ssn_area_900_rejected() {
        assert_eq!(detected("900-45-6789"), Vec::<String>::new());
    }

    #[test]
    fn test_ssn_group_00_rejected() {
        assert_eq!(detected("123-00-6789"), Vec::<String>::new());
    }

    #[test]
    fn test_ssn_serial_0000_rejected() {
        assert_eq!(detected("123-45-0000"), Vec::<String>::new());
    }

    // --- Confidence and metadata --------------------------------------------

    #[test]
    fn test_ssn_context_boosts_confidence() {
        let boosted = first_confidence("ssn 123-45-6789");
        let baseline = first_confidence("value 123-45-6789");
        assert!(boosted > baseline);
    }

    #[test]
    fn test_ssn_supported_locale_is_us() {
        let expected: &[Locale] = &[Locale::US];
        assert_eq!(SsnRecognizer.supported_locales(), expected);
    }

    // --- Area range edges ---------------------------------------------------

    #[test]
    fn test_ssn_area_899_detected() {
        assert_eq!(detected("899-45-6789"), vec!["899-45-6789"]);
    }

    #[test]
    fn test_ssn_area_999_rejected() {
        assert_eq!(detected("999-45-6789"), Vec::<String>::new());
    }

    // --- Boundaries ---------------------------------------------------------

    #[test]
    fn test_ssn_embedded_in_word_not_detected() {
        assert_eq!(detected("id123-45-6789"), Vec::<String>::new());
    }

    #[test]
    fn test_ssn_social_security_context_boosts_confidence() {
        let boosted = first_confidence("social security 123-45-6789");
        let baseline = first_confidence("value 123-45-6789");
        assert!(boosted > baseline);
    }

    #[test]
    fn test_ssn_plain_confidence_lower_than_separated() {
        let plain = first_confidence("123456789");
        let separated = first_confidence("123-45-6789");
        assert!(plain < separated);
    }
}