1use crate::common::{compile_regex, confidence};
2use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashSet;
6
7static ISO_DATE_REGEX: Lazy<Regex> = Lazy::new(|| compile_regex(r"\b\d{4}-\d{1,2}-\d{1,2}\b"));
8static SLASH_DASH_DATE_REGEX: Lazy<Regex> =
9 Lazy::new(|| compile_regex(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{4}\b"));
10static MONTH_NAME_DATE_REGEX: Lazy<Regex> = Lazy::new(|| {
11 compile_regex(
12 r"(?i)\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\.?\s+\d{1,2},?\s+\d{4}\b",
13 )
14});
15static DAY_MONTH_NAME_DATE_REGEX: Lazy<Regex> = Lazy::new(|| {
16 compile_regex(
17 r"(?i)\b\d{1,2}\s+(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\.?\s+\d{4}\b",
18 )
19});
20
/// Lower-case phrases that mark a nearby date as a date of birth.
///
/// `has_birth_context_before` looks for these as plain substrings of the
/// lower-cased text preceding a candidate, so every entry must itself be
/// lower-case. Because matching is by substring, `"born on"` is already
/// covered by `"born"`; it is listed for readability only.
const CONTEXT_TERMS: &[&str] = &[
    // Abbreviations.
    "dob",
    "d.o.b",
    // English phrases.
    "date of birth",
    "birth date",
    "born",
    "born on",
    "birthday",
    // Dutch.
    "geboortedatum",
];
31
/// Recognizer that reports dates as dates of birth when birth-related wording
/// appears shortly before them.
///
/// The type is a stateless unit struct; all behaviour lives in its
/// `Recognizer` implementation and private helper methods.
#[derive(Clone, Copy, Debug, Default)]
pub struct DateOfBirthRecognizer;
45
46impl Recognizer for DateOfBirthRecognizer {
47 fn id(&self) -> &str {
48 "date_of_birth_context_v1"
49 }
50
51 fn entity_type(&self) -> EntityType {
52 EntityType::DateOfBirth
53 }
54
55 fn supported_locales(&self) -> &[Locale] {
56 &[]
57 }
58
59 fn scan(&self, text: &str) -> Vec<PiiEntity> {
60 let mut seen = HashSet::new();
61 let mut findings = Vec::new();
62
63 for regex in [
64 &*ISO_DATE_REGEX,
65 &*SLASH_DASH_DATE_REGEX,
66 &*MONTH_NAME_DATE_REGEX,
67 &*DAY_MONTH_NAME_DATE_REGEX,
68 ] {
69 for matched in regex.find_iter(text) {
70 if seen.insert((matched.start(), matched.end()))
71 && self.is_valid_match(text, matched.start(), matched.end())
72 {
73 findings.push(PiiEntity {
74 entity_type: self.entity_type(),
75 span: Span::new(matched.start(), matched.end()),
76 text: matched.as_str().to_string(),
77 confidence: self.compute_confidence(text, matched.start()),
78 recognizer_id: self.id().to_string(),
79 });
80 }
81 }
82 }
83
84 findings.sort_by_key(|finding| finding.span.start);
85 findings
86 }
87
88 fn validate(&self, candidate: &str) -> bool {
89 parse_date(candidate).is_some()
90 }
91}
92
93impl DateOfBirthRecognizer {
94 fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
95 self.validate(&text[start..end])
96 && has_birth_context_before(text, start)
97 && is_date_boundary(text, start, end)
98 }
99
100 fn compute_confidence(&self, text: &str, start: usize) -> Confidence {
101 let boost = if has_strong_birth_context_before(text, start) {
102 0.15
103 } else {
104 0.08
105 };
106 confidence(0.76 + boost)
107 }
108}
109
/// Numeric year, month and day extracted from a candidate string.
///
/// The fields hold whatever was parsed; they are not guaranteed to form a
/// real calendar date until checked with `is_valid_date`.
#[derive(Clone, Copy, Debug)]
struct DateParts {
    /// Calendar year as written in the text.
    year: u16,
    /// Month number; 1 is January when the value is valid.
    month: u8,
    /// Day of the month.
    day: u8,
}
116
117fn parse_date(candidate: &str) -> Option<DateParts> {
118 parse_iso_date(candidate)
119 .or_else(|| parse_numeric_date(candidate))
120 .or_else(|| parse_month_name_date(candidate))
121 .filter(|date| is_valid_date(*date))
122}
123
124fn parse_iso_date(candidate: &str) -> Option<DateParts> {
125 let mut parts = candidate.split('-');
126 let year = parts.next()?.parse().ok()?;
127 let month = parts.next()?.parse().ok()?;
128 let day = parts.next()?.parse().ok()?;
129 parts
130 .next()
131 .is_none()
132 .then_some(DateParts { year, month, day })
133}
134
135fn parse_numeric_date(candidate: &str) -> Option<DateParts> {
136 let separator = if candidate.contains('/') { '/' } else { '-' };
137 let values: Vec<u16> = candidate
138 .split(separator)
139 .filter_map(|part| part.parse().ok())
140 .collect();
141 if values.len() != 3 {
142 return None;
143 }
144
145 let first = values[0].try_into().ok()?;
146 let second = values[1].try_into().ok()?;
147 let year = values[2];
148
149 [
150 DateParts {
151 year,
152 month: first,
153 day: second,
154 },
155 DateParts {
156 year,
157 month: second,
158 day: first,
159 },
160 ]
161 .into_iter()
162 .find(|date| is_valid_date(*date))
163}
164
165fn parse_month_name_date(candidate: &str) -> Option<DateParts> {
166 let cleaned = candidate.replace(',', "");
167 let parts: Vec<&str> = cleaned.split_whitespace().collect();
168 let [first, second, year] = parts.as_slice() else {
169 return None;
170 };
171
172 if let Some(month) = month_number(first) {
173 Some(DateParts {
174 year: year.parse().ok()?,
175 month,
176 day: second.parse().ok()?,
177 })
178 } else {
179 Some(DateParts {
180 year: year.parse().ok()?,
181 month: month_number(second)?,
182 day: first.parse().ok()?,
183 })
184 }
185}
186
/// Maps an English month name or abbreviation to its number (1 = January).
///
/// Matching ignores ASCII case and any trailing `.` characters. Returns
/// `None` for anything that is not a listed name.
fn month_number(value: &str) -> Option<u8> {
    // Accepted spellings, indexed by month order.
    const MONTH_ALIASES: [&[&str]; 12] = [
        &["jan", "january"],
        &["feb", "february"],
        &["mar", "march"],
        &["apr", "april"],
        &["may"],
        &["jun", "june"],
        &["jul", "july"],
        &["aug", "august"],
        &["sep", "sept", "september"],
        &["oct", "october"],
        &["nov", "november"],
        &["dec", "december"],
    ];

    let normalized = value.trim_end_matches('.').to_ascii_lowercase();
    MONTH_ALIASES
        .iter()
        .zip(1u8..)
        .find(|(aliases, _)| aliases.contains(&normalized.as_str()))
        .map(|(_, number)| number)
}
205
206fn is_valid_date(date: DateParts) -> bool {
207 (1900..=2100).contains(&date.year)
208 && (1..=12).contains(&date.month)
209 && (1..=days_in_month(date.year, date.month)).contains(&date.day)
210}
211
212fn days_in_month(year: u16, month: u8) -> u8 {
213 match month {
214 1 | 3 | 5 | 7 | 8 | 10 | 12 => 31,
215 4 | 6 | 9 | 11 => 30,
216 2 if is_leap_year(year) => 29,
217 2 => 28,
218 _ => 0,
219 }
220}
221
/// Applies the Gregorian leap-year rule: years divisible by 400 are leap
/// years, other century years are not, and remaining years are leap years
/// when divisible by 4.
fn is_leap_year(year: u16) -> bool {
    if year % 400 == 0 {
        return true;
    }
    if year % 100 == 0 {
        return false;
    }
    year % 4 == 0
}
225
226fn has_birth_context_before(text: &str, start: usize) -> bool {
227 let window = context_window_before(text, start);
228 CONTEXT_TERMS.iter().any(|term| window.contains(term))
229}
230
231fn has_strong_birth_context_before(text: &str, start: usize) -> bool {
232 let window = context_window_before(text, start);
233 ["dob", "date of birth", "d.o.b", "geboortedatum"]
234 .iter()
235 .any(|term| window.contains(term))
236}
237
/// Returns up to the last 80 characters of `text` that precede byte offset
/// `start`, converted to ASCII lower case.
///
/// The window is measured in characters, not bytes, so multi-byte text is
/// never split. `start` must lie on a character boundary of `text`.
fn context_window_before(text: &str, start: usize) -> String {
    let prefix = &text[..start];

    // Byte offset of the 80th character counted from the end, or the very
    // beginning when the prefix is shorter than that.
    let window_start = prefix
        .char_indices()
        .rev()
        .nth(79)
        .map_or(0, |(offset, _)| offset);

    prefix[window_start..].to_ascii_lowercase()
}
249
/// Checks that the match at `text[start..end]` is not part of a longer
/// date-like or word-like token.
///
/// The character immediately before the match must not be a date
/// continuation character. The character immediately after must not be one
/// either, with one exception: a `.` counts as a continuation only when an
/// ASCII letter or digit follows it. This keeps rejecting `1980-04-23.5`
/// while accepting a date that simply ends a sentence, such as
/// `DOB 1980-04-23.`.
///
/// `start` and `end` must lie on character boundaries of `text`.
fn is_date_boundary(text: &str, start: usize, end: usize) -> bool {
    let before = text[..start].chars().next_back();
    if before.is_some_and(is_date_continuation) {
        return false;
    }

    let mut following = text[end..].chars();
    let continues_after = match following.next() {
        // Sentence punctuation: only a continuation if more token text follows.
        Some('.') => following.next().is_some_and(|c| c.is_ascii_alphanumeric()),
        Some(c) => is_date_continuation(c),
        None => false,
    };
    !continues_after
}

/// Returns `true` for characters that can extend a date or word token:
/// ASCII letters and digits, `/`, `-` and `.`.
fn is_date_continuation(c: char) -> bool {
    c.is_ascii_alphanumeric() || matches!(c, '/' | '-' | '.')
}
259
/// Unit tests for `DateOfBirthRecognizer`: supported formats, context
/// requirements, calendar validation, boundary handling and confidence.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::default_registry;

    /// Scans `input` and returns the text of every finding, in scan order.
    fn texts(input: &str) -> Vec<String> {
        let mut matched = Vec::new();
        for finding in DateOfBirthRecognizer.scan(input) {
            matched.push(finding.text);
        }
        matched
    }

    /// Asserts that scanning `input` yields exactly `expected`, in order.
    #[track_caller]
    fn assert_detects(input: &str, expected: &[&str]) {
        assert_eq!(texts(input), expected);
    }

    /// Asserts that scanning `input` yields no findings.
    #[track_caller]
    fn assert_rejects(input: &str) {
        assert!(texts(input).is_empty());
    }

    #[test]
    fn test_date_of_birth_iso_with_dob_context_detected() {
        assert_detects("DOB: 1980-04-23", &["1980-04-23"]);
    }

    #[test]
    fn test_date_of_birth_slash_mdy_with_context_detected() {
        assert_detects("date of birth 04/23/1980", &["04/23/1980"]);
    }

    #[test]
    fn test_date_of_birth_slash_dmy_with_context_detected() {
        assert_detects("born 23/04/1980", &["23/04/1980"]);
    }

    #[test]
    fn test_date_of_birth_dash_numeric_with_context_detected() {
        assert_detects("birthday 23-04-1980", &["23-04-1980"]);
    }

    #[test]
    fn test_date_of_birth_month_name_with_context_detected() {
        assert_detects("DOB Apr 23, 1980", &["Apr 23, 1980"]);
    }

    #[test]
    fn test_date_of_birth_day_month_name_with_context_detected() {
        assert_detects("born 23 April 1980", &["23 April 1980"]);
    }

    #[test]
    fn test_date_of_birth_geboortedatum_context_detected() {
        assert_detects("geboortedatum 1980-04-23", &["1980-04-23"]);
    }

    #[test]
    fn test_date_of_birth_without_birth_context_rejected() {
        assert_rejects("invoice date 1980-04-23");
    }

    #[test]
    fn test_date_of_birth_context_after_date_rejected() {
        assert_rejects("1980-04-23 date of birth");
    }

    #[test]
    fn test_date_of_birth_invalid_day_rejected() {
        assert_rejects("DOB 1980-04-31");
    }

    #[test]
    fn test_date_of_birth_invalid_month_rejected() {
        assert_rejects("DOB 1980-13-23");
    }

    #[test]
    fn test_date_of_birth_invalid_february_rejected() {
        assert_rejects("DOB 1981-02-29");
    }

    #[test]
    fn test_date_of_birth_leap_day_detected() {
        assert_detects("DOB 1980-02-29", &["1980-02-29"]);
    }

    #[test]
    fn test_date_of_birth_embedded_in_word_rejected() {
        assert_rejects("DOB x1980-04-23");
    }

    #[test]
    fn test_date_of_birth_embedded_in_longer_date_rejected() {
        assert_rejects("DOB 1980-04-23-01");
    }

    #[test]
    fn test_date_of_birth_multiple_values_detected() {
        assert_detects(
            "DOB 1980-04-23 and birthday May 5, 1991",
            &["1980-04-23", "May 5, 1991"],
        );
    }

    #[test]
    fn test_date_of_birth_strong_context_has_higher_confidence() {
        let recognizer = DateOfBirthRecognizer;
        let strong = recognizer.scan("date of birth 1980-04-23");
        let weaker = recognizer.scan("born 1980-04-23");

        let strong_score = &strong[0].confidence;
        let weaker_score = &weaker[0].confidence;
        assert!(strong_score > weaker_score);
    }

    #[test]
    fn test_date_of_birth_supported_locales_are_universal() {
        let locales = DateOfBirthRecognizer.supported_locales();
        assert!(locales.is_empty());
    }

    #[test]
    fn test_date_of_birth_default_registry_detects_date_of_birth() {
        let findings = default_registry().scan_all("DOB 1980-04-23");

        let found_dob = findings
            .iter()
            .any(|finding| finding.entity_type == EntityType::DateOfBirth);
        assert!(found_dob);
    }
}