Skip to main content

whichtime_sys/parsers/zh/
standard_date.rs

1//! Chinese standard date parser
2//!
3//! Handles Chinese date formats like:
4//! - "2016年9月3号" / "2016年9月3號" (YYYY年M月D号/號)
5//! - "9月3号" (M月D号)
6//! - "二零一六年九月三号" (Chinese numerals)
7//! - "二零一六年,九月三号" (with comma separator)
8
9use crate::components::Component;
10use crate::context::ParsingContext;
11use crate::dictionaries::zh::{fullwidth_to_halfwidth, zh_string_to_number};
12use crate::error::Result;
13use crate::parsers::Parser;
14use crate::results::ParsedResult;
15use chrono::Datelike;
16use fancy_regex::Regex;
17use std::sync::LazyLock;
18
19// Pattern for Chinese date: YYYY年M月D号/號/日
20// Supports both Arabic numerals and Chinese numerals
21// Also supports comma/space between year and month
22static PATTERN: LazyLock<Regex> = LazyLock::new(|| {
23    Regex::new(
24        r"(?P<year>[0-90-9一二三四五六七八九十零〇]+)年[,,\s]*(?P<month>[0-90-9一二三四五六七八九十]+)月(?P<day>[0-90-9一二三四五六七八九十]+)(?:号|號|日)"
25    ).unwrap()
26});
27
28// Pattern for month-day only
29static MONTH_DAY_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
30    Regex::new(
31        r"(?P<month>[0-90-9一二三四五六七八九十]+)月(?P<day>[0-90-9一二三四五六七八九十]+)(?:号|號|日)"
32    ).unwrap()
33});
34
/// Parser for standard Chinese calendar dates such as `2016年9月3号` or `9月3号`.
///
/// The type is a field-less unit struct, so values are free to copy and every
/// instance behaves the same.
#[derive(Debug, Clone, Copy)]
pub struct ZHStandardDateParser;
37
38impl ZHStandardDateParser {
39    pub fn new() -> Self {
40        Self
41    }
42
43    fn parse_number(s: &str) -> i32 {
44        // First convert full-width to half-width
45        let hankaku = fullwidth_to_halfwidth(s);
46
47        // Try parsing as regular number
48        if let Ok(n) = hankaku.parse::<i32>() {
49            return n;
50        }
51
52        // Try as Chinese numerals
53        zh_string_to_number(s) as i32
54    }
55
56    fn parse_year(s: &str) -> i32 {
57        // For years like "二零一六", we need special handling
58        // It's written digit by digit, not like "二千零一十六"
59        let hankaku = fullwidth_to_halfwidth(s);
60
61        // Try parsing as regular number first
62        if let Ok(n) = hankaku.parse::<i32>() {
63            // Handle 2-digit years
64            if n < 100 {
65                return if n > 50 { 1900 + n } else { 2000 + n };
66            }
67            return n;
68        }
69
70        // For Chinese numerals written digit-by-digit (二零一六)
71        // We need to handle each character
72        let mut result = 0i32;
73        let mut has_chinese = false;
74
75        for c in s.chars() {
76            let c_str = c.to_string();
77            let digit = match c_str.as_str() {
78                "零" | "〇" => {
79                    has_chinese = true;
80                    0
81                }
82                "一" => {
83                    has_chinese = true;
84                    1
85                }
86                "二" => {
87                    has_chinese = true;
88                    2
89                }
90                "三" => {
91                    has_chinese = true;
92                    3
93                }
94                "四" => {
95                    has_chinese = true;
96                    4
97                }
98                "五" => {
99                    has_chinese = true;
100                    5
101                }
102                "六" => {
103                    has_chinese = true;
104                    6
105                }
106                "七" => {
107                    has_chinese = true;
108                    7
109                }
110                "八" => {
111                    has_chinese = true;
112                    8
113                }
114                "九" => {
115                    has_chinese = true;
116                    9
117                }
118                _ => continue,
119            };
120            result = result * 10 + digit;
121        }
122
123        if has_chinese && result > 0 {
124            // Handle 2-digit years
125            if result < 100 {
126                return if result > 50 {
127                    1900 + result
128                } else {
129                    2000 + result
130                };
131            }
132            return result;
133        }
134
135        // Fallback to standard conversion
136        zh_string_to_number(s) as i32
137    }
138
139    fn is_valid_date(year: i32, month: i32, day: i32) -> bool {
140        if !(1..=12).contains(&month) || !(1..=31).contains(&day) {
141            return false;
142        }
143        let days_in_month = match month {
144            1 | 3 | 5 | 7 | 8 | 10 | 12 => 31,
145            4 | 6 | 9 | 11 => 30,
146            2 => {
147                if (year % 4 == 0 && year % 100 != 0) || (year % 400 == 0) {
148                    29
149                } else {
150                    28
151                }
152            }
153            _ => return false,
154        };
155        day <= days_in_month
156    }
157}
158
159impl Parser for ZHStandardDateParser {
160    fn name(&self) -> &'static str {
161        "ZHStandardDateParser"
162    }
163
164    fn should_apply(&self, context: &ParsingContext) -> bool {
165        context.text.contains('月')
166            && (context.text.contains('号')
167                || context.text.contains('號')
168                || context.text.contains('日'))
169    }
170
171    fn parse(&self, context: &ParsingContext) -> Result<Vec<ParsedResult>> {
172        let mut results = Vec::new();
173        let ref_date = context.reference.instant;
174
175        let mut start = 0;
176        while start < context.text.len() {
177            let search_text = &context.text[start..];
178
179            // Try full pattern (with year) first
180            if let Ok(Some(caps)) = PATTERN.captures(search_text) {
181                let full_match = caps.get(0).unwrap();
182                let match_start = start + full_match.start();
183                let match_end = start + full_match.end();
184
185                let year = caps
186                    .name("year")
187                    .map(|m| Self::parse_year(m.as_str()))
188                    .unwrap_or(0);
189                let month = caps
190                    .name("month")
191                    .map(|m| Self::parse_number(m.as_str()))
192                    .unwrap_or(0);
193                let day = caps
194                    .name("day")
195                    .map(|m| Self::parse_number(m.as_str()))
196                    .unwrap_or(0);
197
198                if Self::is_valid_date(year, month, day) {
199                    let mut components = context.create_components();
200                    components.assign(Component::Year, year);
201                    components.assign(Component::Month, month);
202                    components.assign(Component::Day, day);
203
204                    results.push(context.create_result(match_start, match_end, components, None));
205                    start = match_end;
206                    continue;
207                }
208            }
209
210            // Try month-day pattern
211            if let Ok(Some(caps)) = MONTH_DAY_PATTERN.captures(search_text) {
212                let full_match = caps.get(0).unwrap();
213                let match_start = start + full_match.start();
214                let match_end = start + full_match.end();
215
216                let month = caps
217                    .name("month")
218                    .map(|m| Self::parse_number(m.as_str()))
219                    .unwrap_or(0);
220                let day = caps
221                    .name("day")
222                    .map(|m| Self::parse_number(m.as_str()))
223                    .unwrap_or(0);
224                let year = ref_date.year();
225
226                if Self::is_valid_date(year, month, day) {
227                    let mut components = context.create_components();
228                    components.imply(Component::Year, year);
229                    components.assign(Component::Month, month);
230                    components.assign(Component::Day, day);
231
232                    results.push(context.create_result(match_start, match_end, components, None));
233                    start = match_end;
234                    continue;
235                }
236            }
237
238            // No match - advance
239            if let Some(c) = search_text.chars().next() {
240                start += c.len_utf8();
241            } else {
242                break;
243            }
244        }
245
246        Ok(results)
247    }
248}
249
250impl Default for ZHStandardDateParser {
251    fn default() -> Self {
252        Self::new()
253    }
254}