text_processing_rs/taggers/
date.rs1use super::cardinal::words_to_number;
10use super::ordinal;
11
/// Lowercase English month names in calendar order (January through December).
///
/// `find_month` compares a single lowercase word against these entries.
const MONTHS: [&str; 12] = [
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
];
27
28pub fn parse(input: &str) -> Option<String> {
30 let original = input.trim();
31 let input_lower = original.to_lowercase();
32
33 if let Some(result) = parse_quarter(&input_lower) {
35 return Some(result);
36 }
37
38 if let Some(result) = parse_bc_year(&input_lower) {
40 return Some(result);
41 }
42
43 if let Some(result) = parse_decade(&input_lower) {
45 return Some(result);
46 }
47
48 if let Some(result) = parse_day_of_month(original, &input_lower) {
50 return Some(result);
51 }
52
53 if let Some(result) = parse_month_year(original, &input_lower) {
56 return Some(result);
57 }
58
59 if let Some(result) = parse_month_day_year(original, &input_lower) {
61 return Some(result);
62 }
63
64 if let Some(result) = parse_year(&input_lower) {
66 return Some(result);
67 }
68
69 None
70}
71
72fn parse_quarter(input: &str) -> Option<String> {
74 let quarters = [
75 ("first quarter of ", "Q1"),
76 ("second quarter of ", "Q2"),
77 ("third quarter of ", "Q3"),
78 ("fourth quarter of ", "Q4"),
79 ];
80
81 for (pattern, q) in &quarters {
82 if input.starts_with(pattern) {
83 let year_part = input.strip_prefix(pattern)?;
84 let year = parse_year_number(year_part)?;
85 return Some(format!("{} {}", q, year));
86 }
87 }
88
89 None
90}
91
92fn parse_bc_year(input: &str) -> Option<String> {
94 let suffixes = [" b c", " bc", " a d", " ad"];
95 for suffix in &suffixes {
96 if input.ends_with(suffix) {
97 let num_part = input.strip_suffix(suffix)?;
98 let year =
101 parse_old_year(num_part).or_else(|| words_to_number(num_part).map(|n| n as i64))?;
102 let era = suffix.replace(" ", "").to_uppercase();
103 return Some(format!("{}{}", year, era));
104 }
105 }
106 None
107}
108
109fn parse_old_year(input: &str) -> Option<i64> {
111 let words: Vec<&str> = input.split_whitespace().collect();
112 if words.len() < 2 {
113 return None;
114 }
115
116 let century = words_to_number(words[0])? as i64;
118 if century < 1 || century > 99 {
119 return None;
120 }
121
122 let year_part = words[1..].join(" ");
124 let year_digits = words_to_number(&year_part)? as i64;
125 if year_digits < 0 || year_digits > 99 {
126 return None;
127 }
128
129 Some(century * 100 + year_digits)
130}
131
132fn parse_decade(input: &str) -> Option<String> {
134 let decades = [
135 ("twenties", 20),
136 ("thirties", 30),
137 ("forties", 40),
138 ("fifties", 50),
139 ("sixties", 60),
140 ("seventies", 70),
141 ("eighties", 80),
142 ("nineties", 90),
143 ];
144
145 for (suffix, decade_val) in &decades {
146 if input.ends_with(suffix) {
147 let prefix = input.strip_suffix(suffix)?.trim();
148 if prefix.is_empty() {
149 return Some(format!("{}s", decade_val));
151 }
152 let century = parse_century_prefix(prefix)?;
154 return Some(format!("{}{}s", century, decade_val));
155 }
156 }
157
158 None
159}
160
/// Maps the spoken century in front of a decade word to its number.
///
/// Accepts exactly `"ten"` through `"twenty one"` (10..=21); any other text,
/// including differently spaced or capitalised input, yields `None`.
fn parse_century_prefix(input: &str) -> Option<i64> {
    const SPOKEN_CENTURIES: [(&str, i64); 12] = [
        ("ten", 10),
        ("eleven", 11),
        ("twelve", 12),
        ("thirteen", 13),
        ("fourteen", 14),
        ("fifteen", 15),
        ("sixteen", 16),
        ("seventeen", 17),
        ("eighteen", 18),
        ("nineteen", 19),
        ("twenty", 20),
        ("twenty one", 21),
    ];

    SPOKEN_CENTURIES
        .iter()
        .find(|(spoken, _)| *spoken == input)
        .map(|&(_, value)| value)
}
179
180fn parse_day_of_month(original: &str, input: &str) -> Option<String> {
182 if !input.starts_with("the ") {
183 return None;
184 }
185
186 let rest = input.strip_prefix("the ")?;
187
188 let parts: Vec<&str> = rest.splitn(2, " of ").collect();
190 if parts.len() != 2 {
191 return None;
192 }
193
194 let day_part = parts[0];
195 let month_year_part = parts[1];
196
197 let day = ordinal::parse(day_part)?;
199 let day_num: String = day.chars().filter(|c| c.is_ascii_digit()).collect();
201
202 let words: Vec<&str> = month_year_part.split_whitespace().collect();
204 let orig_words: Vec<&str> = original.split_whitespace().collect();
205 if words.is_empty() {
206 return None;
207 }
208
209 let _month = find_month(words[0])?;
210 let orig_month = find_original_month(orig_words.iter().copied(), words[0]);
212
213 if words.len() == 1 {
214 return Some(format!("{} {}", day_num, orig_month));
216 }
217
218 let year_words = words[1..].join(" ");
220 let year = parse_year_number(&year_words)?;
221 Some(format!("{} {} {}", day_num, orig_month, year))
222}
223
224fn parse_month_day_year(original: &str, input: &str) -> Option<String> {
226 let words: Vec<&str> = input.split_whitespace().collect();
227 let orig_words: Vec<&str> = original.split_whitespace().collect();
228 if words.is_empty() {
229 return None;
230 }
231
232 let _month = find_month(words[0])?;
234 let orig_month = orig_words.first().copied().unwrap_or(words[0]);
235
236 if words.len() < 2 {
237 return None;
238 }
239
240 for split_point in 2..=words.len().min(4) {
243 let day_words = words[1..split_point].join(" ");
244
245 if let Some(day_str) = ordinal::parse(&day_words) {
247 let day_num: String = day_str.chars().filter(|c| c.is_ascii_digit()).collect();
248
249 if split_point == words.len() {
250 return Some(format!("{} {}", orig_month, day_num));
252 }
253
254 let year_words = words[split_point..].join(" ");
256 if let Some(year) = parse_year_number(&year_words) {
257 return Some(format!("{} {} {}", orig_month, day_num, year));
258 }
259 }
260 }
261
262 if words.len() >= 2 {
264 if let Some(day) = words_to_number(words[1]).map(|n| n as i64) {
265 if day >= 1 && day <= 31 {
266 if words.len() == 2 {
267 return Some(format!("{} {}", orig_month, day));
268 }
269
270 let year_words = words[2..].join(" ");
272 if let Some(year) = parse_year_number(&year_words) {
273 return Some(format!("{} {} {}", orig_month, day, year));
274 }
275 }
276 }
277 }
278
279 None
280}
281
282fn parse_month_year(original: &str, input: &str) -> Option<String> {
284 let words: Vec<&str> = input.split_whitespace().collect();
285 let orig_words: Vec<&str> = original.split_whitespace().collect();
286 if words.len() < 2 {
287 return None;
288 }
289
290 let _month = find_month(words[0])?;
291 let orig_month = orig_words.first().copied().unwrap_or(words[0]);
292 let year_words = words[1..].join(" ");
293 let year = parse_year_number(&year_words)?;
294
295 Some(format!("{} {}", orig_month, year))
296}
297
298fn parse_year(input: &str) -> Option<String> {
301 let words: Vec<&str> = input.split_whitespace().collect();
302
303 if input.starts_with("two thousand") || input.starts_with("one thousand") {
305 return parse_year_number(input).map(|y| y.to_string());
306 }
307
308 if words.len() == 2 {
311 let century_prefix = words[0];
312 let year_suffix = words[1];
313
314 if century_prefix == "twenty" {
317 let is_teens = matches!(
318 year_suffix,
319 "ten"
320 | "eleven"
321 | "twelve"
322 | "thirteen"
323 | "fourteen"
324 | "fifteen"
325 | "sixteen"
326 | "seventeen"
327 | "eighteen"
328 | "nineteen"
329 );
330 if is_teens {
331 return parse_year_number(input).map(|y| y.to_string());
332 }
333 }
334
335 let is_year_suffix = matches!(
337 year_suffix,
338 "ten"
339 | "eleven"
340 | "twelve"
341 | "thirteen"
342 | "fourteen"
343 | "fifteen"
344 | "sixteen"
345 | "seventeen"
346 | "eighteen"
347 | "nineteen"
348 | "twenty"
349 | "thirty"
350 | "forty"
351 | "fifty"
352 | "sixty"
353 | "seventy"
354 | "eighty"
355 | "ninety"
356 );
357
358 if is_year_suffix
359 && matches!(
360 century_prefix,
361 "eleven"
362 | "twelve"
363 | "thirteen"
364 | "fourteen"
365 | "fifteen"
366 | "sixteen"
367 | "seventeen"
368 | "eighteen"
369 | "nineteen"
370 )
371 {
372 return parse_year_number(input).map(|y| y.to_string());
373 }
374 }
375
376 if words.len() >= 3 {
378 if matches!(
379 words[0],
380 "eleven"
381 | "twelve"
382 | "thirteen"
383 | "fourteen"
384 | "fifteen"
385 | "sixteen"
386 | "seventeen"
387 | "eighteen"
388 | "nineteen"
389 | "twenty"
390 ) {
391 return parse_year_number(input).map(|y| y.to_string());
392 }
393 }
394
395 None
396}
397
398fn parse_year_number(input: &str) -> Option<i64> {
400 let words: Vec<&str> = input.split_whitespace().collect();
401 if words.is_empty() {
402 return None;
403 }
404
405 if input.starts_with("two thousand") {
407 let rest = input
408 .strip_prefix("two thousand")?
409 .trim()
410 .trim_start_matches("and ")
411 .trim();
412
413 if rest.is_empty() {
414 return Some(2000);
415 }
416
417 let year_part = words_to_number(rest)? as i64;
418 return Some(2000 + year_part);
419 }
420
421 if input.starts_with("one thousand") {
423 let rest = input.strip_prefix("one thousand")?.trim();
424 if rest.is_empty() {
425 return Some(1000);
426 }
427
428 let year_part = words_to_number(rest)? as i64;
429 return Some(1000 + year_part);
430 }
431
432 if words.len() >= 2 {
436 let century = match words[0] {
437 "nineteen" => Some(19),
438 "twenty" => Some(20),
439 "eighteen" => Some(18),
440 "seventeen" => Some(17),
441 "sixteen" => Some(16),
442 "fifteen" => Some(15),
443 "fourteen" => Some(14),
444 "thirteen" => Some(13),
445 "twelve" => Some(12),
446 "eleven" => Some(11),
447 _ => None,
448 };
449
450 if let Some(c) = century {
451 let year_part = words[1..].join(" ");
452
453 if year_part.starts_with("oh ") || year_part.starts_with("o ") {
455 let digit_part = year_part
456 .strip_prefix("oh ")
457 .or_else(|| year_part.strip_prefix("o "))?;
458 let digit = words_to_number(digit_part)? as i64;
459 return Some(c * 100 + digit);
460 }
461
462 if let Some(yy) = words_to_number(&year_part).map(|n| n as i64) {
464 if yy >= 0 && yy <= 99 {
465 return Some(c * 100 + yy);
466 }
467 }
468 }
469 }
470
471 if let Some(num) = words_to_number(input).map(|n| n as i64) {
474 if num >= 100 && num <= 9999 {
475 return Some(num);
476 }
477 }
478
479 None
480}
481
482fn find_month(word: &str) -> Option<&'static str> {
484 for month in &MONTHS {
485 if word == *month {
486 return Some(month);
487 }
488 }
489 None
490}
491
/// Recovers the month word as the caller wrote it.
///
/// Scans `orig_words` for the first word whose lowercase form equals
/// `lower_month` and returns it with its original capitalisation. When no
/// word matches, `lower_month` itself is returned.
fn find_original_month<'a, I>(orig_words: I, lower_month: &str) -> String
where
    I: Iterator<Item = &'a str>,
{
    orig_words
        .into_iter()
        .find(|candidate| candidate.to_lowercase() == lower_month)
        .map_or_else(|| lower_month.to_string(), |written| written.to_string())
}
504
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every `(spoken, written)` pair round-trips through `parse`.
    fn assert_all_parse(cases: &[(&str, &str)]) {
        for &(spoken, written) in cases {
            assert_eq!(parse(spoken), Some(written.to_string()));
        }
    }

    #[test]
    fn test_decades() {
        assert_all_parse(&[
            ("nineteen eighties", "1980s"),
            ("nineteen nineties", "1990s"),
        ]);
    }

    #[test]
    fn test_years() {
        assert_all_parse(&[
            ("two thousand and twenty", "2020"),
            ("nineteen ninety four", "1994"),
            ("twenty twelve", "2012"),
        ]);
    }

    #[test]
    fn test_month_day() {
        assert_all_parse(&[("january first", "january 1"), ("june thirty", "june 30")]);
    }

    #[test]
    fn test_month_day_year() {
        assert_all_parse(&[("july twenty fifth two thousand twelve", "july 25 2012")]);
    }

    #[test]
    fn test_day_of_month() {
        assert_all_parse(&[("the fifteenth of january", "15 january")]);
    }

    #[test]
    fn test_quarter() {
        assert_all_parse(&[("second quarter of twenty twenty two", "Q2 2022")]);
    }

    #[test]
    fn test_bc() {
        assert_all_parse(&[("seven fifty b c", "750BC")]);
    }
}
556}