Skip to main content

cloakrs_patterns/
ssn.rs

1use crate::common::{compile_regex, confidence, context_boost, digits, is_boundary};
2use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
3use once_cell::sync::Lazy;
4use regex::Regex;
5
6static SSN_REGEX: Lazy<Regex> =
7    Lazy::new(|| compile_regex(r"\b(?:\d{3}[- ]\d{2}[- ]\d{4}|\d{9})\b"));
8
9static US_LOCALES: &[Locale] = &[Locale::US];
10
11const CONTEXT_WORDS: &[&str] = &["ssn", "social security", "tax id", "taxpayer"];
12
/// Recognizes United States Social Security Numbers.
///
/// This is a field-less unit struct: it carries no configuration or state,
/// so values can be freely copied and created with `Default`.
#[derive(Default, Copy, Clone, Debug)]
pub struct SsnRecognizer;
16
17impl Recognizer for SsnRecognizer {
18    fn id(&self) -> &str {
19        "us_ssn_regex_v1"
20    }
21
22    fn entity_type(&self) -> EntityType {
23        EntityType::Ssn
24    }
25
26    fn supported_locales(&self) -> &[Locale] {
27        US_LOCALES
28    }
29
30    fn scan(&self, text: &str) -> Vec<PiiEntity> {
31        SSN_REGEX
32            .find_iter(text)
33            .filter(|matched| self.is_valid_match(text, matched.start(), matched.end()))
34            .map(|matched| PiiEntity {
35                entity_type: self.entity_type(),
36                span: Span::new(matched.start(), matched.end()),
37                text: matched.as_str().to_string(),
38                confidence: self.compute_confidence(text, matched.start(), matched.as_str()),
39                recognizer_id: self.id().to_string(),
40            })
41            .collect()
42    }
43
44    fn validate(&self, candidate: &str) -> bool {
45        let digits = digits(candidate);
46        if digits.len() != 9 {
47            return false;
48        }
49
50        let area = &digits[0..3];
51        let group = &digits[3..5];
52        let serial = &digits[5..9];
53
54        area != "000"
55            && area != "666"
56            && !matches!(area.parse::<u16>(), Ok(900..=999))
57            && group != "00"
58            && serial != "0000"
59    }
60}
61
62impl SsnRecognizer {
63    fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
64        self.validate(&text[start..end]) && is_boundary(text, start, end)
65    }
66
67    fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
68        let base = if candidate.contains('-') || candidate.contains(' ') {
69            0.85
70        } else {
71            0.50
72        };
73        confidence(base + context_boost(text, start, CONTEXT_WORDS))
74    }
75}
76
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `input` and returns only the matched text of each finding, in order.
    fn scan_texts(input: &str) -> Vec<String> {
        let mut matched = Vec::new();
        for finding in SsnRecognizer.scan(input) {
            matched.push(finding.text);
        }
        matched
    }

    /// True when scanning `input` produces no findings at all.
    fn rejects(input: &str) -> bool {
        scan_texts(input).is_empty()
    }

    // --- accepted formats ---

    #[test]
    fn test_ssn_dash_format_detected() {
        let found = scan_texts("SSN 123-45-6789");
        assert_eq!(found, ["123-45-6789"]);
    }

    #[test]
    fn test_ssn_space_format_detected() {
        let found = scan_texts("123 45 6789");
        assert_eq!(found, ["123 45 6789"]);
    }

    #[test]
    fn test_ssn_plain_format_detected() {
        let found = scan_texts("123456789");
        assert_eq!(found, ["123456789"]);
    }

    // --- structural rejections ---

    #[test]
    fn test_ssn_area_000_rejected() {
        assert!(rejects("000-45-6789"));
    }

    #[test]
    fn test_ssn_area_666_rejected() {
        assert!(rejects("666-45-6789"));
    }

    #[test]
    fn test_ssn_area_900_rejected() {
        assert!(rejects("900-45-6789"));
    }

    #[test]
    fn test_ssn_group_00_rejected() {
        assert!(rejects("123-00-6789"));
    }

    #[test]
    fn test_ssn_serial_0000_rejected() {
        assert!(rejects("123-45-0000"));
    }

    #[test]
    fn test_ssn_context_boosts_confidence() {
        let boosted = SsnRecognizer.scan("ssn 123-45-6789");
        let baseline = SsnRecognizer.scan("value 123-45-6789");
        assert!(boosted[0].confidence > baseline[0].confidence);
    }

    #[test]
    fn test_ssn_supported_locale_is_us() {
        let locales = SsnRecognizer.supported_locales();
        assert_eq!(locales, &[Locale::US]);
    }

    // --- edges of the 900..=999 area exclusion ---

    #[test]
    fn test_ssn_area_899_detected() {
        let found = scan_texts("899-45-6789");
        assert_eq!(found, ["899-45-6789"]);
    }

    #[test]
    fn test_ssn_area_999_rejected() {
        assert!(rejects("999-45-6789"));
    }

    #[test]
    fn test_ssn_embedded_in_word_not_detected() {
        assert!(rejects("id123-45-6789"));
    }

    #[test]
    fn test_ssn_social_security_context_boosts_confidence() {
        let boosted = SsnRecognizer.scan("social security 123-45-6789");
        let baseline = SsnRecognizer.scan("value 123-45-6789");
        assert!(boosted[0].confidence > baseline[0].confidence);
    }

    #[test]
    fn test_ssn_plain_confidence_lower_than_separated() {
        let bare = SsnRecognizer.scan("123456789");
        let dashed = SsnRecognizer.scan("123-45-6789");
        assert!(bare[0].confidence < dashed[0].confidence);
    }
}