Skip to main content

cloakrs_patterns/
date_of_birth.rs

1use crate::common::{compile_regex, confidence};
2use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashSet;
6
/// Matches year-first numeric dates such as `1980-04-23`: a four-digit year
/// followed by one- or two-digit fields, all separated by `-`.
static ISO_DATE_REGEX: Lazy<Regex> = Lazy::new(|| compile_regex(r"\b\d{4}-\d{1,2}-\d{1,2}\b"));
/// Matches year-last numeric dates such as `04/23/1980` or `23-04-1980`.
///
/// Each separator may be `/` or `-`; the pattern itself does not decide
/// whether the first field is the month or the day.
static SLASH_DASH_DATE_REGEX: Lazy<Regex> =
    Lazy::new(|| compile_regex(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{4}\b"));
/// Matches month-name-first dates such as `Apr 23, 1980` or `April 23 1980`.
///
/// Case-insensitive; accepts abbreviated or full English month names, an
/// optional `.` after the name, and an optional `,` after the day.
static MONTH_NAME_DATE_REGEX: Lazy<Regex> = Lazy::new(|| {
    compile_regex(
        r"(?i)\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\.?\s+\d{1,2},?\s+\d{4}\b",
    )
});
/// Matches day-first month-name dates such as `23 April 1980` or `23 Apr. 1980`.
///
/// Case-insensitive; accepts the same month spellings as the month-first
/// pattern, with an optional `.` after the name.
static DAY_MONTH_NAME_DATE_REGEX: Lazy<Regex> = Lazy::new(|| {
    compile_regex(
        r"(?i)\b\d{1,2}\s+(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\.?\s+\d{4}\b",
    )
});
20
/// Terms that, when found in the text shortly before a date, mark that date
/// as a likely date of birth.
///
/// Entries are lowercase because they are compared against a window of
/// preceding text that has been ASCII-lowercased first.
const CONTEXT_TERMS: &[&str] = &[
    "dob",
    "d.o.b",
    "date of birth",
    "birth date",
    "born",
    "born on",
    "birthday",
    // Dutch for "date of birth".
    "geboortedatum",
];
31
/// Recognizes dates that are likely dates of birth from nearby birth context.
///
/// A date is only reported when a birth-related term (for example `DOB`,
/// `born` or `date of birth`) appears in the text shortly before it; a term
/// that appears only after the date does not count. Supported date shapes
/// are year-first (`1980-04-23`), year-last numeric (`04/23/1980`,
/// `23-04-1980`) and English month-name dates (`Apr 23, 1980`,
/// `23 April 1980`).
///
/// # Examples
///
/// ```
/// use cloakrs_core::{EntityType, Recognizer};
/// use cloakrs_patterns::DateOfBirthRecognizer;
///
/// let findings = DateOfBirthRecognizer.scan("DOB: 1980-04-23");
/// assert_eq!(findings[0].entity_type, EntityType::DateOfBirth);
/// ```
#[derive(Debug, Clone, Copy, Default)]
pub struct DateOfBirthRecognizer;
45
46impl Recognizer for DateOfBirthRecognizer {
47    fn id(&self) -> &str {
48        "date_of_birth_context_v1"
49    }
50
51    fn entity_type(&self) -> EntityType {
52        EntityType::DateOfBirth
53    }
54
55    fn supported_locales(&self) -> &[Locale] {
56        &[]
57    }
58
59    fn scan(&self, text: &str) -> Vec<PiiEntity> {
60        let mut seen = HashSet::new();
61        let mut findings = Vec::new();
62
63        for regex in [
64            &*ISO_DATE_REGEX,
65            &*SLASH_DASH_DATE_REGEX,
66            &*MONTH_NAME_DATE_REGEX,
67            &*DAY_MONTH_NAME_DATE_REGEX,
68        ] {
69            for matched in regex.find_iter(text) {
70                if seen.insert((matched.start(), matched.end()))
71                    && self.is_valid_match(text, matched.start(), matched.end())
72                {
73                    findings.push(PiiEntity {
74                        entity_type: self.entity_type(),
75                        span: Span::new(matched.start(), matched.end()),
76                        text: matched.as_str().to_string(),
77                        confidence: self.compute_confidence(text, matched.start()),
78                        recognizer_id: self.id().to_string(),
79                    });
80                }
81            }
82        }
83
84        findings.sort_by_key(|finding| finding.span.start);
85        findings
86    }
87
88    fn validate(&self, candidate: &str) -> bool {
89        parse_date(candidate).is_some()
90    }
91}
92
93impl DateOfBirthRecognizer {
94    fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
95        self.validate(&text[start..end])
96            && has_birth_context_before(text, start)
97            && is_date_boundary(text, start, end)
98    }
99
100    fn compute_confidence(&self, text: &str, start: usize) -> Confidence {
101        let boost = if has_strong_birth_context_before(text, start) {
102            0.15
103        } else {
104            0.08
105        };
106        confidence(0.76 + boost)
107    }
108}
109
/// Numeric components of a date as extracted from text.
///
/// Holding a `DateParts` does not imply the date exists on the calendar;
/// range and day-of-month checks are done separately by `is_valid_date`.
#[derive(Debug, Clone, Copy)]
struct DateParts {
    /// Calendar year.
    year: u16,
    /// Month number, where 1 is January.
    month: u8,
    /// Day of the month, starting at 1.
    day: u8,
}
116
117fn parse_date(candidate: &str) -> Option<DateParts> {
118    parse_iso_date(candidate)
119        .or_else(|| parse_numeric_date(candidate))
120        .or_else(|| parse_month_name_date(candidate))
121        .filter(|date| is_valid_date(*date))
122}
123
124fn parse_iso_date(candidate: &str) -> Option<DateParts> {
125    let mut parts = candidate.split('-');
126    let year = parts.next()?.parse().ok()?;
127    let month = parts.next()?.parse().ok()?;
128    let day = parts.next()?.parse().ok()?;
129    parts
130        .next()
131        .is_none()
132        .then_some(DateParts { year, month, day })
133}
134
135fn parse_numeric_date(candidate: &str) -> Option<DateParts> {
136    let separator = if candidate.contains('/') { '/' } else { '-' };
137    let values: Vec<u16> = candidate
138        .split(separator)
139        .filter_map(|part| part.parse().ok())
140        .collect();
141    if values.len() != 3 {
142        return None;
143    }
144
145    let first = values[0].try_into().ok()?;
146    let second = values[1].try_into().ok()?;
147    let year = values[2];
148
149    [
150        DateParts {
151            year,
152            month: first,
153            day: second,
154        },
155        DateParts {
156            year,
157            month: second,
158            day: first,
159        },
160    ]
161    .into_iter()
162    .find(|date| is_valid_date(*date))
163}
164
165fn parse_month_name_date(candidate: &str) -> Option<DateParts> {
166    let cleaned = candidate.replace(',', "");
167    let parts: Vec<&str> = cleaned.split_whitespace().collect();
168    let [first, second, year] = parts.as_slice() else {
169        return None;
170    };
171
172    if let Some(month) = month_number(first) {
173        Some(DateParts {
174            year: year.parse().ok()?,
175            month,
176            day: second.parse().ok()?,
177        })
178    } else {
179        Some(DateParts {
180            year: year.parse().ok()?,
181            month: month_number(second)?,
182            day: first.parse().ok()?,
183        })
184    }
185}
186
/// Maps an English month name or abbreviation to its number (1 = January).
///
/// Matching ignores ASCII case and any trailing `.` characters, so `"Sept."`
/// and `"SEP"` both yield `Some(9)`. Unknown names yield `None`.
fn month_number(value: &str) -> Option<u8> {
    // Accepted spellings per month, indexed by month number minus one.
    const MONTH_ALIASES: [&[&str]; 12] = [
        &["jan", "january"],
        &["feb", "february"],
        &["mar", "march"],
        &["apr", "april"],
        &["may"],
        &["jun", "june"],
        &["jul", "july"],
        &["aug", "august"],
        &["sep", "sept", "september"],
        &["oct", "october"],
        &["nov", "november"],
        &["dec", "december"],
    ];

    let name = value.trim_end_matches('.');
    MONTH_ALIASES
        .iter()
        .zip(1u8..)
        .find(|(aliases, _)| aliases.iter().any(|alias| alias.eq_ignore_ascii_case(name)))
        .map(|(_, number)| number)
}
205
206fn is_valid_date(date: DateParts) -> bool {
207    (1900..=2100).contains(&date.year)
208        && (1..=12).contains(&date.month)
209        && (1..=days_in_month(date.year, date.month)).contains(&date.day)
210}
211
212fn days_in_month(year: u16, month: u8) -> u8 {
213    match month {
214        1 | 3 | 5 | 7 | 8 | 10 | 12 => 31,
215        4 | 6 | 9 | 11 => 30,
216        2 if is_leap_year(year) => 29,
217        2 => 28,
218        _ => 0,
219    }
220}
221
/// Returns `true` when `year` is a leap year under the Gregorian rule:
/// century years must be divisible by 400, all other years by 4.
fn is_leap_year(year: u16) -> bool {
    if year % 100 == 0 {
        year % 400 == 0
    } else {
        year % 4 == 0
    }
}
225
226fn has_birth_context_before(text: &str, start: usize) -> bool {
227    let window = context_window_before(text, start);
228    CONTEXT_TERMS.iter().any(|term| window.contains(term))
229}
230
231fn has_strong_birth_context_before(text: &str, start: usize) -> bool {
232    let window = context_window_before(text, start);
233    ["dob", "date of birth", "d.o.b", "geboortedatum"]
234        .iter()
235        .any(|term| window.contains(term))
236}
237
/// Returns the last 80 characters of `text` before byte offset `start`,
/// ASCII-lowercased. Shorter prefixes are returned whole.
///
/// `start` must lie on a character boundary of `text`.
fn context_window_before(text: &str, start: usize) -> String {
    let prefix = &text[..start];

    // Byte offset of the 80th character counted from the end, or the start
    // of the prefix when it holds fewer than 80 characters.
    let window_start = prefix
        .char_indices()
        .rev()
        .nth(79)
        .map_or(0, |(offset, _)| offset);

    prefix[window_start..].to_ascii_lowercase()
}
249
/// Returns `true` when the match at `start..end` is not glued to surrounding
/// characters that would make it part of a longer token.
///
/// `start` and `end` are byte offsets on character boundaries of `text`.
/// A letter, digit, `/` or `-` directly next to the match always rejects it.
/// A `.` next to the match rejects it only when the character on the far
/// side of the `.` is a letter or digit (as in `1980-04-23.5` or
/// `1980-04-23.txt`), so a date that simply ends a sentence is still
/// accepted.
fn is_date_boundary(text: &str, start: usize, end: usize) -> bool {
    let mut preceding = text[..start].chars().rev();
    let adjacent_before = preceding.next();
    let beyond_before = preceding.next();

    let mut following = text[end..].chars();
    let adjacent_after = following.next();
    let beyond_after = following.next();

    !continues_date(adjacent_before, beyond_before)
        && !continues_date(adjacent_after, beyond_after)
}

/// Decides whether the character touching a match (`adjacent`) extends it,
/// looking one character further (`beyond`) to tell a sentence-ending period
/// apart from a dotted continuation.
fn continues_date(adjacent: Option<char>, beyond: Option<char>) -> bool {
    match adjacent {
        Some('.') => beyond.is_some_and(|c| c.is_ascii_alphanumeric()),
        Some(c) => is_date_continuation(c),
        None => false,
    }
}

/// Returns `true` for characters that can extend a date-like token: ASCII
/// letters and digits, `/`, `-` and `.`.
fn is_date_continuation(c: char) -> bool {
    c.is_ascii_alphanumeric() || matches!(c, '/' | '-' | '.')
}
259
// Unit tests for `DateOfBirthRecognizer`: accepted date formats, the
// requirement for birth context before the date, calendar validation,
// boundary handling, confidence ordering and registry wiring.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::default_registry;

    /// Scans `input` and returns only the matched text of each finding, in
    /// the order `scan` returns them.
    fn texts(input: &str) -> Vec<String> {
        DateOfBirthRecognizer
            .scan(input)
            .into_iter()
            .map(|finding| finding.text)
            .collect()
    }

    // --- Accepted formats and context terms ---

    #[test]
    fn test_date_of_birth_iso_with_dob_context_detected() {
        assert_eq!(texts("DOB: 1980-04-23"), ["1980-04-23"]);
    }

    #[test]
    fn test_date_of_birth_slash_mdy_with_context_detected() {
        assert_eq!(texts("date of birth 04/23/1980"), ["04/23/1980"]);
    }

    #[test]
    fn test_date_of_birth_slash_dmy_with_context_detected() {
        assert_eq!(texts("born 23/04/1980"), ["23/04/1980"]);
    }

    #[test]
    fn test_date_of_birth_dash_numeric_with_context_detected() {
        assert_eq!(texts("birthday 23-04-1980"), ["23-04-1980"]);
    }

    #[test]
    fn test_date_of_birth_month_name_with_context_detected() {
        assert_eq!(texts("DOB Apr 23, 1980"), ["Apr 23, 1980"]);
    }

    #[test]
    fn test_date_of_birth_day_month_name_with_context_detected() {
        assert_eq!(texts("born 23 April 1980"), ["23 April 1980"]);
    }

    #[test]
    fn test_date_of_birth_geboortedatum_context_detected() {
        assert_eq!(texts("geboortedatum 1980-04-23"), ["1980-04-23"]);
    }

    // --- Context must be present, and must come before the date ---

    #[test]
    fn test_date_of_birth_without_birth_context_rejected() {
        assert!(texts("invoice date 1980-04-23").is_empty());
    }

    #[test]
    fn test_date_of_birth_context_after_date_rejected() {
        assert!(texts("1980-04-23 date of birth").is_empty());
    }

    // --- Calendar validation ---

    #[test]
    fn test_date_of_birth_invalid_day_rejected() {
        // April has 30 days.
        assert!(texts("DOB 1980-04-31").is_empty());
    }

    #[test]
    fn test_date_of_birth_invalid_month_rejected() {
        assert!(texts("DOB 1980-13-23").is_empty());
    }

    #[test]
    fn test_date_of_birth_invalid_february_rejected() {
        // 1981 is not a leap year.
        assert!(texts("DOB 1981-02-29").is_empty());
    }

    #[test]
    fn test_date_of_birth_leap_day_detected() {
        // 1980 is a leap year.
        assert_eq!(texts("DOB 1980-02-29"), ["1980-02-29"]);
    }

    // --- Boundary handling ---

    #[test]
    fn test_date_of_birth_embedded_in_word_rejected() {
        assert!(texts("DOB x1980-04-23").is_empty());
    }

    #[test]
    fn test_date_of_birth_embedded_in_longer_date_rejected() {
        assert!(texts("DOB 1980-04-23-01").is_empty());
    }

    // --- Multiple findings, confidence and integration ---

    #[test]
    fn test_date_of_birth_multiple_values_detected() {
        assert_eq!(
            texts("DOB 1980-04-23 and birthday May 5, 1991"),
            ["1980-04-23", "May 5, 1991"]
        );
    }

    #[test]
    fn test_date_of_birth_strong_context_has_higher_confidence() {
        let strong = DateOfBirthRecognizer.scan("date of birth 1980-04-23");
        let weaker = DateOfBirthRecognizer.scan("born 1980-04-23");
        assert!(strong[0].confidence > weaker[0].confidence);
    }

    #[test]
    fn test_date_of_birth_supported_locales_are_universal() {
        assert!(DateOfBirthRecognizer.supported_locales().is_empty());
    }

    #[test]
    fn test_date_of_birth_default_registry_detects_date_of_birth() {
        let findings = default_registry().scan_all("DOB 1980-04-23");

        assert!(findings
            .iter()
            .any(|finding| finding.entity_type == EntityType::DateOfBirth));
    }
}