Skip to main content

whichtime_sys/parsers/common/
month_name.rs

1//! Multi-locale month name parser: "15 January 2024", "15. Januar 2024", etc.
2//!
3//! Handles date expressions with month names across all supported locales.
4//! European locales use little endian format (day month year).
5
6use crate::components::Component;
7use crate::context::ParsingContext;
8use crate::dictionaries::Locale;
9use crate::error::Result;
10use crate::parsers::Parser;
11use crate::results::ParsedResult;
12use crate::scanner::TokenType;
13use chrono::Datelike;
14use regex::Regex;
15use std::sync::LazyLock;
16
// Locale-specific patterns. Most are little endian (day month year); the
// Japanese and Chinese patterns are big endian (year month day).
18static EN_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
19    Regex::new(r"(?i)(?:^|\W)(\d{1,2})(?:st|nd|rd|th)?\s+(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s*,?\s*(\d{2,4})?").unwrap()
20});
21
22static DE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
23    Regex::new(r"(?i)(?:^|\W)(\d{1,2})\.?\s+(januar|jänner|janner|jan|februar|feber|feb|märz|maerz|mär|mrz|april|apr|mai|juni|jun|juli|jul|august|aug|september|sep|sept|oktober|okt|november|nov|dezember|dez)\s*,?\s*(\d{2,4})?").unwrap()
24});
25
26static ES_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
27    Regex::new(r"(?i)(?:^|\W)(?:el\s+)?(\d{1,2})(?:\s+de)?\s+(enero|ene|febrero|feb|marzo|mar|abril|abr|mayo|may|junio|jun|julio|jul|agosto|ago|septiembre|sep|sept|octubre|oct|noviembre|nov|diciembre|dic)(?:\s+(?:de(?:l)?\s+)?(\d{2,4}))?").unwrap()
28});
29
30static FR_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
31    Regex::new(r"(?i)(?:^|\W)(?:le\s+)?(\d{1,2})(?:er|ème|eme|e)?\s+(janvier|janv?|février|fevrier|févr?|fevr?|mars|avril|avr|mai|juin|juillet|juil?|août|aout|aou|septembre|sept?|octobre|oct|novembre|nov|décembre|decembre|déc|dec)\s*(\d{2,4})?").unwrap()
32});
33
34static IT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
35    Regex::new(r"(?i)(?:^|\W)(?:il\s+)?(\d{1,2})(?:°)?\s+(gennaio|gen|febbraio|feb|marzo|mar|aprile|apr|maggio|mag|giugno|giu|luglio|lug|agosto|ago|settembre|set|sett|ottobre|ott|novembre|nov|dicembre|dic)\s*(\d{2,4})?").unwrap()
36});
37
38static JA_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
39    Regex::new(r"(\d{2,4})?年?\s*(1|2|3|4|5|6|7|8|9|10|11|12|一|二|三|四|五|六|七|八|九|十|十一|十二)\s*月\s*(\d{1,2}|一|二|三|四|五|六|七|八|九|十|十一|十二|十三|十四|十五|十六|十七|十八|十九|二十|二十一|二十二|二十三|二十四|二十五|二十六|二十七|二十八|二十九|三十|三十一)\s*[日号]").unwrap()
40});
41
42static NL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
43    Regex::new(r"(?i)(?:^|\W)(\d{1,2})(?:e|ste|de)?\s+(januari|jan|februari|feb|maart|mrt|april|apr|mei|juni|jun|juli|jul|augustus|aug|september|sep|sept|oktober|okt|november|nov|december|dec)\s*(\d{2,4})?").unwrap()
44});
45
46static PT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
47    Regex::new(r"(?i)(?:^|\W)(\d{1,2})(?:\s+de)?\s+(janeiro|jan|fevereiro|fev|março|marco|mar|abril|abr|maio|mai|junho|jun|julho|jul|agosto|ago|setembro|set|outubro|out|novembro|nov|dezembro|dez)(?:\s+(?:de\s+)?(\d{2,4}))?").unwrap()
48});
49
50static RU_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
51    Regex::new(r"(?i)(?:^|\W)(\d{1,2})(?:-?[ео]?е?|го)?\s+(января|янв|февраля|фев|марта|мар|апреля|апр|мая|июня|июн|июля|июл|августа|авг|сентября|сен|сент|октября|окт|ноября|ноя|декабря|дек)\s*(\d{2,4})?").unwrap()
52});
53
54static SV_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
55    Regex::new(r"(?i)(?:^|\W)(?:den\s+)?(\d{1,2})(?::?[ae]?)?\s+(januari|jan|februari|feb|mars|april|apr|maj|juni|jun|juli|jul|augusti|aug|september|sep|sept|oktober|okt|november|nov|december|dec)\s*(\d{2,4})?").unwrap()
56});
57
58static UK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
59    Regex::new(r"(?i)(?:^|\W)(\d{1,2})(?:-?[еиого]?)?\s+(січня|січ|лютого|лют|березня|бер|квітня|квіт|травня|трав|червня|черв|липня|лип|серпня|серп|вересня|вер|жовтня|жовт|листопада|лист|грудня|груд)\s*(\d{2,4})?").unwrap()
60});
61
62static ZH_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
63    Regex::new(r"(\d{2,4})?年?\s*(1|2|3|4|5|6|7|8|9|10|11|12|一|二|三|四|五|六|七|八|九|十|十一|十二)\s*月\s*(\d{1,2}|一|二|三|四|五|六|七|八|九|十|十一|十二|十三|十四|十五|十六|十七|十八|十九|二十|二十一|二十二|二十三|二十四|二十五|二十六|二十七|二十八|二十九|三十|三十一)\s*[日号號]").unwrap()
64});
65
66/// Multi-locale month name parser
67pub struct MultiLocaleMonthNameParser {
68    locale: Locale,
69}
70
71impl MultiLocaleMonthNameParser {
72    pub fn new(locale: Locale) -> Self {
73        Self { locale }
74    }
75
76    fn get_pattern(&self) -> &'static Regex {
77        match self.locale {
78            Locale::En => &EN_PATTERN,
79            Locale::De => &DE_PATTERN,
80            Locale::Es => &ES_PATTERN,
81            Locale::Fr => &FR_PATTERN,
82            Locale::It => &IT_PATTERN,
83            Locale::Ja => &JA_PATTERN,
84            Locale::Nl => &NL_PATTERN,
85            Locale::Pt => &PT_PATTERN,
86            Locale::Ru => &RU_PATTERN,
87            Locale::Sv => &SV_PATTERN,
88            Locale::Uk => &UK_PATTERN,
89            Locale::Zh => &ZH_PATTERN,
90        }
91    }
92
93    fn lookup_month(&self, text: &str) -> Option<u32> {
94        let lower = text.to_lowercase();
95        match self.locale {
96            Locale::En => crate::dictionaries::en::get_month(&lower),
97            Locale::De => crate::dictionaries::de::get_month(&lower),
98            Locale::Es => crate::dictionaries::es::get_month(&lower),
99            Locale::Fr => crate::dictionaries::fr::get_month(&lower),
100            Locale::It => crate::dictionaries::it::get_month(&lower),
101            Locale::Ja => crate::dictionaries::ja::get_month(text)
102                .or_else(|| crate::dictionaries::ja::get_month(&lower)),
103            Locale::Nl => crate::dictionaries::nl::get_month(&lower),
104            Locale::Pt => crate::dictionaries::pt::get_month(&lower),
105            Locale::Ru => crate::dictionaries::ru::get_month(&lower),
106            Locale::Sv => crate::dictionaries::sv::get_month(&lower),
107            Locale::Uk => crate::dictionaries::uk::get_month(&lower),
108            Locale::Zh => crate::dictionaries::zh::get_month(text)
109                .or_else(|| crate::dictionaries::zh::get_month(&lower)),
110        }
111    }
112
113    fn parse_day(&self, text: &str) -> Option<i32> {
114        // Try direct numeric parse first
115        if let Ok(n) = text.parse::<i32>() {
116            return Some(n);
117        }
118
119        // Try locale-specific number parsing
120        let num = match self.locale {
121            Locale::Ja => crate::dictionaries::ja::parse_number_pattern(text),
122            Locale::Zh => crate::dictionaries::zh::parse_number_pattern(text),
123            _ => return None,
124        };
125
126        if num > 0.0 { Some(num as i32) } else { None }
127    }
128}
129
130impl Parser for MultiLocaleMonthNameParser {
131    fn name(&self) -> &'static str {
132        "MultiLocaleMonthNameParser"
133    }
134
135    fn should_apply(&self, context: &ParsingContext) -> bool {
136        context.has_token_type(TokenType::Month)
137    }
138
139    fn parse(&self, context: &ParsingContext) -> Result<Vec<ParsedResult>> {
140        let mut results = Vec::new();
141        let pattern = self.get_pattern();
142        let ref_date = context.reference.instant;
143
144        for mat in pattern.find_iter(context.text) {
145            let matched_text = mat.as_str();
146            let index = mat.start();
147
148            let Some(caps) = pattern.captures(matched_text) else {
149                continue;
150            };
151
152            // Extract day, month, year - order depends on locale
153            let (day, month, year_str) = match self.locale {
154                Locale::Ja | Locale::Zh => {
155                    // Year month day order (optional year)
156                    let year_str = caps.get(1).map(|m| m.as_str());
157                    let month_str = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
158                    let day_str = caps.get(3).map(|m| m.as_str()).unwrap_or_default();
159
160                    let month = self
161                        .lookup_month(month_str)
162                        .or_else(|| month_str.parse::<u32>().ok());
163                    let day = self.parse_day(day_str);
164
165                    (day, month, year_str)
166                }
167                _ => {
168                    // Day month year order (European style)
169                    let day_str = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
170                    let month_str = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
171                    let year_str = caps.get(3).map(|m| m.as_str());
172
173                    let day: Option<i32> = day_str.parse().ok();
174                    let month = self.lookup_month(month_str);
175
176                    (day, month, year_str)
177                }
178            };
179
180            let Some(month) = month else {
181                continue;
182            };
183
184            let day = day.unwrap_or(1);
185
186            if !(1..=31).contains(&day) {
187                continue;
188            }
189
190            let year = if let Some(y) = year_str {
191                parse_year(y)
192            } else {
193                // Determine year based on whether month is in future or past
194                let current_month = ref_date.month() as i32;
195                if (month as i32) < current_month {
196                    ref_date.year() + 1
197                } else {
198                    ref_date.year()
199                }
200            };
201
202            let mut components = context.create_components();
203            components.assign(Component::Year, year);
204            components.assign(Component::Month, month as i32);
205            components.assign(Component::Day, day);
206
207            if !components.is_valid_date() {
208                continue;
209            }
210
211            // Find actual text bounds
212            let actual_start = matched_text
213                .find(|c: char| c.is_alphanumeric())
214                .unwrap_or(0);
215            results.push(context.create_result(
216                index + actual_start,
217                index + matched_text.len(),
218                components,
219                None,
220            ));
221        }
222
223        Ok(results)
224    }
225}
226
/// Converts a captured year string into a calendar year.
///
/// Values of 100 or more are returned unchanged. Smaller values are treated
/// as two-digit years: above 50 they fall in the 1900s, otherwise in the
/// 2000s. Text that does not parse as an integer is treated as `0`.
fn parse_year(s: &str) -> i32 {
    match s.parse::<i32>().unwrap_or(0) {
        full if full >= 100 => full,
        short if short > 50 => 1900 + short,
        short => 2000 + short,
    }
}