Skip to main content

text_processing_rs/taggers/
date.rs

1//! Date tagger.
2//!
3//! Converts spoken date expressions to written form:
4//! - "july twenty fifth two thousand twelve" → "july 25 2012"
5//! - "nineteen eighties" → "1980s"
6//! - "the twenty fifth of july" → "25 july"
7//! - "january first" → "january 1"
8
9use super::cardinal::words_to_number;
10use super::ordinal;
11
/// Lowercase English month names in calendar order (index 0 is "january").
///
/// `find_month` compares candidate words against these entries verbatim.
const MONTHS: [&str; 12] = [
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
];
27
28/// Parse spoken date expression to written form.
29pub fn parse(input: &str) -> Option<String> {
30    let original = input.trim();
31    let input_lower = original.to_lowercase();
32
33    // Try quarter pattern first (most specific)
34    if let Some(result) = parse_quarter(&input_lower) {
35        return Some(result);
36    }
37
38    // Try BC/AD years
39    if let Some(result) = parse_bc_year(&input_lower) {
40        return Some(result);
41    }
42
43    // Try decades (nineteen eighties → 1980s)
44    if let Some(result) = parse_decade(&input_lower) {
45        return Some(result);
46    }
47
48    // Try "the Xth of month [year]" pattern
49    if let Some(result) = parse_day_of_month(original, &input_lower) {
50        return Some(result);
51    }
52
53    // Try month + year first (july 2012, july two thousand twelve)
54    // This must come before month_day_year to avoid "two" being parsed as day 2
55    if let Some(result) = parse_month_year(original, &input_lower) {
56        return Some(result);
57    }
58
59    // Try month + day + year patterns (july twenty fifth twenty twelve)
60    if let Some(result) = parse_month_day_year(original, &input_lower) {
61        return Some(result);
62    }
63
64    // Try standalone year patterns
65    if let Some(result) = parse_year(&input_lower) {
66        return Some(result);
67    }
68
69    None
70}
71
72/// Parse quarter expressions like "second quarter of twenty twenty two" → "Q2 2022"
73fn parse_quarter(input: &str) -> Option<String> {
74    let quarters = [
75        ("first quarter of ", "Q1"),
76        ("second quarter of ", "Q2"),
77        ("third quarter of ", "Q3"),
78        ("fourth quarter of ", "Q4"),
79    ];
80
81    for (pattern, q) in &quarters {
82        if input.starts_with(pattern) {
83            let year_part = input.strip_prefix(pattern)?;
84            let year = parse_year_number(year_part)?;
85            return Some(format!("{} {}", q, year));
86        }
87    }
88
89    None
90}
91
92/// Parse BC years like "seven fifty b c" → "750BC"
93fn parse_bc_year(input: &str) -> Option<String> {
94    let suffixes = [" b c", " bc", " a d", " ad"];
95    for suffix in &suffixes {
96        if input.ends_with(suffix) {
97            let num_part = input.strip_suffix(suffix)?;
98            // Try year-style parsing first (seven fifty → 750)
99            // This handles patterns like "seven fifty" as 7*100+50
100            let year =
101                parse_old_year(num_part).or_else(|| words_to_number(num_part).map(|n| n as i64))?;
102            let era = suffix.replace(" ", "").to_uppercase();
103            return Some(format!("{}{}", year, era));
104        }
105    }
106    None
107}
108
109/// Parse old-style year like "seven fifty" → 750, "twelve thirty four" → 1234
110fn parse_old_year(input: &str) -> Option<i64> {
111    let words: Vec<&str> = input.split_whitespace().collect();
112    if words.len() < 2 {
113        return None;
114    }
115
116    // First word is century (ones or tens digit)
117    let century = words_to_number(words[0])? as i64;
118    if century < 1 || century > 99 {
119        return None;
120    }
121
122    // Remaining words are the two-digit year
123    let year_part = words[1..].join(" ");
124    let year_digits = words_to_number(&year_part)? as i64;
125    if year_digits < 0 || year_digits > 99 {
126        return None;
127    }
128
129    Some(century * 100 + year_digits)
130}
131
132/// Parse decades like "nineteen eighties" → "1980s"
133fn parse_decade(input: &str) -> Option<String> {
134    let decades = [
135        ("twenties", 20),
136        ("thirties", 30),
137        ("forties", 40),
138        ("fifties", 50),
139        ("sixties", 60),
140        ("seventies", 70),
141        ("eighties", 80),
142        ("nineties", 90),
143    ];
144
145    for (suffix, decade_val) in &decades {
146        if input.ends_with(suffix) {
147            let prefix = input.strip_suffix(suffix)?.trim();
148            if prefix.is_empty() {
149                // Just "eighties" without century
150                return Some(format!("{}s", decade_val));
151            }
152            // Parse century prefix like "nineteen"
153            let century = parse_century_prefix(prefix)?;
154            return Some(format!("{}{}s", century, decade_val));
155        }
156    }
157
158    None
159}
160
/// Map a spoken century prefix to its number (e.g. "nineteen" → 19).
///
/// Accepts exactly "ten" through "twenty one", lowercase and single-spaced;
/// any other input yields `None`.
fn parse_century_prefix(input: &str) -> Option<i64> {
    // Entries are consecutive, so the n-th entry stands for century 10 + n.
    const SPOKEN_CENTURIES: [&str; 12] = [
        "ten",
        "eleven",
        "twelve",
        "thirteen",
        "fourteen",
        "fifteen",
        "sixteen",
        "seventeen",
        "eighteen",
        "nineteen",
        "twenty",
        "twenty one",
    ];

    (10_i64..)
        .zip(SPOKEN_CENTURIES.iter())
        .find(|&(_, &spoken)| spoken == input)
        .map(|(century, _)| century)
}
179
180/// Parse "the Xth of month [year]" pattern
181fn parse_day_of_month(original: &str, input: &str) -> Option<String> {
182    if !input.starts_with("the ") {
183        return None;
184    }
185
186    let rest = input.strip_prefix("the ")?;
187
188    // Find " of "
189    let parts: Vec<&str> = rest.splitn(2, " of ").collect();
190    if parts.len() != 2 {
191        return None;
192    }
193
194    let day_part = parts[0];
195    let month_year_part = parts[1];
196
197    // Parse day as ordinal
198    let day = ordinal::parse(day_part)?;
199    // Remove suffix to get number
200    let day_num: String = day.chars().filter(|c| c.is_ascii_digit()).collect();
201
202    // Parse month and optional year
203    let words: Vec<&str> = month_year_part.split_whitespace().collect();
204    let orig_words: Vec<&str> = original.split_whitespace().collect();
205    if words.is_empty() {
206        return None;
207    }
208
209    let _month = find_month(words[0])?;
210    // Find the original month casing
211    let orig_month = find_original_month(orig_words.iter().copied(), words[0]);
212
213    if words.len() == 1 {
214        // Just month
215        return Some(format!("{} {}", day_num, orig_month));
216    }
217
218    // Month + year
219    let year_words = words[1..].join(" ");
220    let year = parse_year_number(&year_words)?;
221    Some(format!("{} {} {}", day_num, orig_month, year))
222}
223
224/// Parse month + day + year patterns
225fn parse_month_day_year(original: &str, input: &str) -> Option<String> {
226    let words: Vec<&str> = input.split_whitespace().collect();
227    let orig_words: Vec<&str> = original.split_whitespace().collect();
228    if words.is_empty() {
229        return None;
230    }
231
232    // First word must be a month
233    let _month = find_month(words[0])?;
234    let orig_month = orig_words.first().copied().unwrap_or(words[0]);
235
236    if words.len() < 2 {
237        return None;
238    }
239
240    // Try to find where day ends and year begins
241    // Day can be ordinal (twenty fifth) or cardinal (thirty)
242    for split_point in 2..=words.len().min(4) {
243        let day_words = words[1..split_point].join(" ");
244
245        // Try ordinal first
246        if let Some(day_str) = ordinal::parse(&day_words) {
247            let day_num: String = day_str.chars().filter(|c| c.is_ascii_digit()).collect();
248
249            if split_point == words.len() {
250                // No year
251                return Some(format!("{} {}", orig_month, day_num));
252            }
253
254            // Try to parse year from remaining words
255            let year_words = words[split_point..].join(" ");
256            if let Some(year) = parse_year_number(&year_words) {
257                return Some(format!("{} {} {}", orig_month, day_num, year));
258            }
259        }
260    }
261
262    // Try cardinal day (june thirty)
263    if words.len() >= 2 {
264        if let Some(day) = words_to_number(words[1]).map(|n| n as i64) {
265            if day >= 1 && day <= 31 {
266                if words.len() == 2 {
267                    return Some(format!("{} {}", orig_month, day));
268                }
269
270                // Try to parse year
271                let year_words = words[2..].join(" ");
272                if let Some(year) = parse_year_number(&year_words) {
273                    return Some(format!("{} {} {}", orig_month, day, year));
274                }
275            }
276        }
277    }
278
279    None
280}
281
282/// Parse month + year (july 2012)
283fn parse_month_year(original: &str, input: &str) -> Option<String> {
284    let words: Vec<&str> = input.split_whitespace().collect();
285    let orig_words: Vec<&str> = original.split_whitespace().collect();
286    if words.len() < 2 {
287        return None;
288    }
289
290    let _month = find_month(words[0])?;
291    let orig_month = orig_words.first().copied().unwrap_or(words[0]);
292    let year_words = words[1..].join(" ");
293    let year = parse_year_number(&year_words)?;
294
295    Some(format!("{} {}", orig_month, year))
296}
297
298/// Parse standalone year patterns
299/// Only matches specific year patterns, not general numbers
300fn parse_year(input: &str) -> Option<String> {
301    let words: Vec<&str> = input.split_whitespace().collect();
302
303    // "two thousand and X" or "one thousand X" patterns are always years
304    if input.starts_with("two thousand") || input.starts_with("one thousand") {
305        return parse_year_number(input).map(|y| y.to_string());
306    }
307
308    // "nineteen X" or "twenty X" pattern - must have exactly 2 words
309    // This prevents "twenty one" from being matched as 2001 (should be cardinal 21)
310    if words.len() == 2 {
311        let century_prefix = words[0];
312        let year_suffix = words[1];
313
314        // For "twenty X", only allow teens (ten-nineteen) as suffix
315        // This allows "twenty twelve" → 2012 but not "twenty one" → 2001
316        if century_prefix == "twenty" {
317            let is_teens = matches!(
318                year_suffix,
319                "ten"
320                    | "eleven"
321                    | "twelve"
322                    | "thirteen"
323                    | "fourteen"
324                    | "fifteen"
325                    | "sixteen"
326                    | "seventeen"
327                    | "eighteen"
328                    | "nineteen"
329            );
330            if is_teens {
331                return parse_year_number(input).map(|y| y.to_string());
332            }
333        }
334
335        // For other century prefixes (eleven-nineteen), allow teens and tens
336        let is_year_suffix = matches!(
337            year_suffix,
338            "ten"
339                | "eleven"
340                | "twelve"
341                | "thirteen"
342                | "fourteen"
343                | "fifteen"
344                | "sixteen"
345                | "seventeen"
346                | "eighteen"
347                | "nineteen"
348                | "twenty"
349                | "thirty"
350                | "forty"
351                | "fifty"
352                | "sixty"
353                | "seventy"
354                | "eighty"
355                | "ninety"
356        );
357
358        if is_year_suffix
359            && matches!(
360                century_prefix,
361                "eleven"
362                    | "twelve"
363                    | "thirteen"
364                    | "fourteen"
365                    | "fifteen"
366                    | "sixteen"
367                    | "seventeen"
368                    | "eighteen"
369                    | "nineteen"
370            )
371        {
372            return parse_year_number(input).map(|y| y.to_string());
373        }
374    }
375
376    // "nineteen seventy six" style - 3+ words starting with century prefix
377    if words.len() >= 3 {
378        if matches!(
379            words[0],
380            "eleven"
381                | "twelve"
382                | "thirteen"
383                | "fourteen"
384                | "fifteen"
385                | "sixteen"
386                | "seventeen"
387                | "eighteen"
388                | "nineteen"
389                | "twenty"
390        ) {
391            return parse_year_number(input).map(|y| y.to_string());
392        }
393    }
394
395    None
396}
397
398/// Parse year number from spoken form
399fn parse_year_number(input: &str) -> Option<i64> {
400    let words: Vec<&str> = input.split_whitespace().collect();
401    if words.is_empty() {
402        return None;
403    }
404
405    // Handle "two thousand and X" or "two thousand X"
406    if input.starts_with("two thousand") {
407        let rest = input
408            .strip_prefix("two thousand")?
409            .trim()
410            .trim_start_matches("and ")
411            .trim();
412
413        if rest.is_empty() {
414            return Some(2000);
415        }
416
417        let year_part = words_to_number(rest)? as i64;
418        return Some(2000 + year_part);
419    }
420
421    // Handle "one thousand X" (like "one thousand eight" → 1008)
422    if input.starts_with("one thousand") {
423        let rest = input.strip_prefix("one thousand")?.trim();
424        if rest.is_empty() {
425            return Some(1000);
426        }
427
428        let year_part = words_to_number(rest)? as i64;
429        return Some(1000 + year_part);
430    }
431
432    // Handle "nineteen X" or "twenty X" century patterns
433    // "nineteen seventy six" → 1976
434    // "twenty twelve" → 2012
435    if words.len() >= 2 {
436        let century = match words[0] {
437            "nineteen" => Some(19),
438            "twenty" => Some(20),
439            "eighteen" => Some(18),
440            "seventeen" => Some(17),
441            "sixteen" => Some(16),
442            "fifteen" => Some(15),
443            "fourteen" => Some(14),
444            "thirteen" => Some(13),
445            "twelve" => Some(12),
446            "eleven" => Some(11),
447            _ => None,
448        };
449
450        if let Some(c) = century {
451            let year_part = words[1..].join(" ");
452
453            // Handle "oh X" pattern (nineteen oh five → 1905)
454            if year_part.starts_with("oh ") || year_part.starts_with("o ") {
455                let digit_part = year_part
456                    .strip_prefix("oh ")
457                    .or_else(|| year_part.strip_prefix("o "))?;
458                let digit = words_to_number(digit_part)? as i64;
459                return Some(c * 100 + digit);
460            }
461
462            // Parse the two-digit year part
463            if let Some(yy) = words_to_number(&year_part).map(|n| n as i64) {
464                if yy >= 0 && yy <= 99 {
465                    return Some(c * 100 + yy);
466                }
467            }
468        }
469    }
470
471    // Try parsing as a plain number (for years like 1665)
472    // Only if it looks like a year (3-4 digits)
473    if let Some(num) = words_to_number(input).map(|n| n as i64) {
474        if num >= 100 && num <= 9999 {
475            return Some(num);
476        }
477    }
478
479    None
480}
481
482/// Find month name from input
483fn find_month(word: &str) -> Option<&'static str> {
484    for month in &MONTHS {
485        if word == *month {
486            return Some(month);
487        }
488    }
489    None
490}
491
/// Recover a month word as it was originally written.
///
/// Returns the first item of `orig_words` whose lowercased form equals
/// `lower_month`; if none matches, `lower_month` itself is returned.
fn find_original_month<'a, I>(orig_words: I, lower_month: &str) -> String
where
    I: Iterator<Item = &'a str>,
{
    let mut candidates = orig_words;
    candidates
        .find(|candidate| candidate.to_lowercase() == lower_month)
        .unwrap_or(lower_month)
        .to_string()
}
504
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `parse` turns `spoken` into exactly `written`.
    fn assert_tagged(spoken: &str, written: &str) {
        assert_eq!(parse(spoken), Some(written.to_string()));
    }

    #[test]
    fn test_decades() {
        assert_tagged("nineteen eighties", "1980s");
        assert_tagged("nineteen nineties", "1990s");
    }

    #[test]
    fn test_years() {
        assert_tagged("two thousand and twenty", "2020");
        assert_tagged("nineteen ninety four", "1994");
        assert_tagged("twenty twelve", "2012");
    }

    #[test]
    fn test_month_day() {
        assert_tagged("january first", "january 1");
        assert_tagged("june thirty", "june 30");
    }

    #[test]
    fn test_month_day_year() {
        assert_tagged("july twenty fifth two thousand twelve", "july 25 2012");
    }

    #[test]
    fn test_day_of_month() {
        assert_tagged("the fifteenth of january", "15 january");
    }

    #[test]
    fn test_quarter() {
        assert_tagged("second quarter of twenty twenty two", "Q2 2022");
    }

    #[test]
    fn test_bc() {
        assert_tagged("seven fifty b c", "750BC");
    }
}
556}