Skip to main content

rosetta_date/
parser.rs

//! Main parsing orchestrator: tries user-supplied formats, well-known standard
//! formats, the `dateparser` tables, heuristic patterns, and finally NLP parsing.
2
3use crate::datetime::RosettaDateTime;
4use crate::error::{Result, RosettaError};
5use crate::i18n::LanguageData;
6use crate::timezone::{self, TzOffset};
7
8/// Options that control the parsing behaviour.
9#[derive(Debug, Clone)]
10pub struct ParseOptions {
11    /// Explicit format strings to try first (strftime for chrono, `time` syntax
12    /// for the `time` backend).
13    pub formats: Vec<String>,
14    /// Languages to try for NLP parsing.  `None` means auto-detect / try all.
15    pub languages: Option<Vec<&'static LanguageData>>,
16    /// Default timezone when none is found in the input string.
17    pub default_tz: TzOffset,
18    /// Base time for resolving relative expressions.  `None` → use current time.
19    pub base_time: Option<RosettaDateTime>,
20}
21
22impl Default for ParseOptions {
23    fn default() -> Self {
24        Self {
25            formats: vec![],
26            languages: None,
27            default_tz: TzOffset::UTC,
28            base_time: None,
29        }
30    }
31}
32
33// ── Public entry points ───────────────────────────────────────────────
34
35/// Parse a date string with default options.
36pub fn parse(input: &str) -> Result<RosettaDateTime> {
37    parse_with_options(input, &ParseOptions::default())
38}
39
40/// Parse a date string with the given options.
41pub fn parse_with_options(input: &str, options: &ParseOptions) -> Result<RosettaDateTime> {
42    let input = input.trim();
43
44    // 1. Try explicit format strings supplied by the user.
45    for fmt in &options.formats {
46        if let Ok(dt) = parse_with_format(input, fmt, &options.default_tz) {
47            return Ok(dt);
48        }
49    }
50
51    // 2. Try well-known standard formats (ISO 8601, RFC 2822 / 3339, …)
52    if let Ok(dt) = try_standard_formats(input, &options.default_tz) {
53        return Ok(dt);
54    }
55
56    // 3. Try dateparser specific extensive manual formats
57    if let Ok(dt) = crate::dateparser::try_parse(input, &options.default_tz) {
58        return Ok(dt);
59    }
60
61    // 4. Try heuristic common patterns.
62    if let Ok(dt) = try_heuristic_patterns(input, &options.default_tz) {
63        return Ok(dt);
64    }
65
66    // 4. Try NLP / natural-language parsing.
67    let base = options
68        .base_time
69        .clone()
70        .unwrap_or_else(RosettaDateTime::now_utc);
71
72    let lang_refs: Option<Vec<&LanguageData>> = options.languages.clone();
73
74    crate::nlp::parse_nlp(input, &base, lang_refs.as_deref())
75}
76
77// ── Standard formats ──────────────────────────────────────────────────
78
79fn try_standard_formats(input: &str, _default_tz: &TzOffset) -> Result<RosettaDateTime> {
80    #[cfg(feature = "time-backend")]
81    {
82        use time::format_description::well_known::{Iso8601, Rfc2822, Rfc3339};
83
84        // RFC 3339: "2023-10-15T12:30:45+08:00"
85        if let Ok(dt) = time::OffsetDateTime::parse(input, &Rfc3339) {
86            return Ok(RosettaDateTime::Time(dt));
87        }
88        // ISO 8601
89        if let Ok(dt) = time::OffsetDateTime::parse(input, &Iso8601::DEFAULT) {
90            return Ok(RosettaDateTime::Time(dt));
91        }
92        // RFC 2822: "Sun, 15 Oct 2023 12:30:45 +0800"
93        if let Ok(dt) = time::OffsetDateTime::parse(input, &Rfc2822) {
94            return Ok(RosettaDateTime::Time(dt));
95        }
96    }
97
98    #[cfg(all(feature = "chrono-backend", not(feature = "time-backend")))]
99    {
100        // RFC 3339
101        if let Ok(dt) = chrono::DateTime::parse_from_rfc3339(input) {
102            return Ok(RosettaDateTime::Chrono(dt));
103        }
104        // RFC 2822
105        if let Ok(dt) = chrono::DateTime::parse_from_rfc2822(input) {
106            return Ok(RosettaDateTime::Chrono(dt));
107        }
108    }
109
110    Err(RosettaError::ParseError(
111        "No standard format matched".into(),
112    ))
113}
114
115// ── Heuristic patterns ────────────────────────────────────────────────
116
117/// A curated list of common date/time patterns to try.
118fn try_heuristic_patterns(input: &str, default_tz: &TzOffset) -> Result<RosettaDateTime> {
119    // Extract trailing timezone if present
120    let (date_part, offset) =
121        timezone::extract_trailing_timezone(input).unwrap_or((input, *default_tz));
122
123    // Try to parse the date_part with common patterns
124    if let Some(dt) = try_common_date_patterns(date_part, &offset) {
125        return Ok(dt);
126    }
127
128    Err(RosettaError::ParseError(
129        "No heuristic pattern matched".into(),
130    ))
131}
132
133fn try_common_date_patterns(input: &str, offset: &TzOffset) -> Option<RosettaDateTime> {
134    let s = input.trim();
135
136    // Normalise separators: treat '/' and '.' as '-' for initial matching
137    let normalised: String = s
138        .chars()
139        .map(|c| match c {
140            '/' | '.' => '-',
141            _ => c,
142        })
143        .collect();
144
145    // Split into tokens by whitespace – we expect at most a date part and a time part.
146    let parts: Vec<&str> = normalised.split_whitespace().collect();
147
148    let (date_str, time_str) = match parts.len() {
149        1 => {
150            // Could be date-only or dateTimeSeparated
151            if parts[0].contains('T') {
152                let dt_parts: Vec<&str> = parts[0].splitn(2, 'T').collect();
153                (dt_parts[0], dt_parts.get(1).copied())
154            } else {
155                (parts[0], None)
156            }
157        }
158        2 => (parts[0], Some(parts[1])),
159        _ => return None,
160    };
161
162    // Parse date components
163    let date_components: Vec<&str> = date_str.split('-').collect();
164    let (year, month, day) = match date_components.len() {
165        3 => {
166            let a: i32 = date_components[0].parse().ok()?;
167            let b: u32 = date_components[1].parse().ok()?;
168            let c: u32 = date_components[2].parse().ok()?;
169
170            if a > 31 {
171                // YMD
172                (a, b as u8, c as u8)
173            } else if c > 31 {
174                // MDY or DMY with 4-digit year at end
175                if a > 12 {
176                    // DMY
177                    (c as i32, b as u8, a as u8)
178                } else {
179                    // MDY
180                    (c as i32, a as u8, b as u8)
181                }
182            } else {
183                // Ambiguous, assume YMD if first >= 100
184                if a >= 100 {
185                    (a, b as u8, c as u8)
186                } else {
187                    // Assume MDY for 2-digit year
188                    let year = if a < 100 { a + 2000 } else { a };
189                    (year, b as u8, c as u8)
190                }
191            }
192        }
193        _ => return None,
194    };
195
196    // Parse time components
197    let (hour, minute, second) = if let Some(ts) = time_str {
198        let time_parts: Vec<&str> = ts.split(':').collect();
199        match time_parts.len() {
200            2 => {
201                let h: u8 = time_parts[0].parse().ok()?;
202                let m: u8 = time_parts[1].parse().ok()?;
203                (h, m, 0u8)
204            }
205            3 => {
206                let h: u8 = time_parts[0].parse().ok()?;
207                let m: u8 = time_parts[1].parse().ok()?;
208                // Handle fractional seconds: "45.123" → 45
209                let sec_str = time_parts[2].split('.').next()?;
210                let s: u8 = sec_str.parse().ok()?;
211                (h, m, s)
212            }
213            _ => (0, 0, 0),
214        }
215    } else {
216        (0, 0, 0)
217    };
218
219    RosettaDateTime::from_components(year, month, day, hour, minute, second, *offset).ok()
220}
221
222// ── Format-based parsing (user-supplied format) ───────────────────────
223
224fn parse_with_format(input: &str, format: &str, default_tz: &TzOffset) -> Result<RosettaDateTime> {
225    #[cfg(feature = "time-backend")]
226    {
227        let format_desc = time::format_description::parse(format)
228            .map_err(|e| RosettaError::FormatError(e.to_string()))?;
229
230        let tz = time::UtcOffset::from_whole_seconds(default_tz.total_seconds)
231            .map_err(|e| RosettaError::TimezoneError(e.to_string()))?;
232
233        let dt = time::OffsetDateTime::parse(input, &format_desc)
234            .or_else(|_| {
235                time::PrimitiveDateTime::parse(input, &format_desc).map(|p| p.assume_offset(tz))
236            })
237            .or_else(|_| {
238                time::Date::parse(input, &format_desc)
239                    .map(|d| d.with_hms(0, 0, 0).unwrap().assume_offset(tz))
240            })
241            .map_err(|e| RosettaError::ParseError(e.to_string()))?;
242
243        return Ok(RosettaDateTime::Time(dt));
244    }
245
246    #[cfg(all(feature = "chrono-backend", not(feature = "time-backend")))]
247    {
248        use chrono::{DateTime, FixedOffset, NaiveDate, NaiveDateTime};
249        let fo = FixedOffset::east_opt(default_tz.total_seconds)
250            .ok_or_else(|| RosettaError::TimezoneError("Invalid offset".into()))?;
251
252        let dt = DateTime::parse_from_str(input, format)
253            .or_else(|_| {
254                NaiveDateTime::parse_from_str(input, format)
255                    .map(|nd| DateTime::<FixedOffset>::from_naive_utc_and_offset(nd, fo))
256            })
257            .or_else(|_| {
258                NaiveDate::parse_from_str(input, format).map(|nd| {
259                    DateTime::<FixedOffset>::from_naive_utc_and_offset(
260                        nd.and_hms_opt(0, 0, 0).unwrap(),
261                        fo,
262                    )
263                })
264            })
265            .map_err(|e| RosettaError::ParseError(e.to_string()))?;
266
267        return Ok(RosettaDateTime::Chrono(dt));
268    }
269
270    #[allow(unreachable_code)]
271    Err(RosettaError::ParseError("No backend enabled".into()))
272}
273
#[cfg(test)]
mod tests {
    use super::*;

    /// RFC 3339 input keeps the wall-clock fields it was written with.
    #[test]
    fn test_parse_rfc3339() {
        let outcome = parse("2023-10-15T12:30:45+08:00");
        assert!(outcome.is_ok(), "Failed: {:?}", outcome);
        let parsed = outcome.unwrap();
        assert_eq!(parsed.year(), 2023);
        assert_eq!(parsed.month(), 10);
        assert_eq!(parsed.day(), 15);
        assert_eq!(parsed.hour(), 12);
    }

    /// A bare ISO-style date is accepted.
    #[test]
    fn test_parse_date_only() {
        let outcome = parse("2023-10-15");
        assert!(outcome.is_ok(), "Failed: {:?}", outcome);
        let parsed = outcome.unwrap();
        assert_eq!(parsed.year(), 2023);
        assert_eq!(parsed.month(), 10);
        assert_eq!(parsed.day(), 15);
    }

    /// Date and time may be separated by a space instead of `T`.
    #[test]
    fn test_parse_datetime_space() {
        let outcome = parse("2023-10-15 14:30:45");
        assert!(outcome.is_ok(), "Failed: {:?}", outcome);
        let parsed = outcome.unwrap();
        assert_eq!(parsed.hour(), 14);
        assert_eq!(parsed.minute(), 30);
    }

    /// Slashes are accepted as date separators.
    #[test]
    fn test_parse_slash_format() {
        let outcome = parse("2023/10/15 08:00:00");
        assert!(outcome.is_ok(), "Failed: {:?}", outcome);
        let parsed = outcome.unwrap();
        assert_eq!(parsed.year(), 2023);
        assert_eq!(parsed.month(), 10);
    }

    /// A trailing timezone abbreviation determines the resulting offset.
    #[test]
    fn test_parse_with_tz_suffix() {
        let outcome = parse("2023-10-15 12:30:00 PST");
        assert!(outcome.is_ok(), "Failed: {:?}", outcome);
        let parsed = outcome.unwrap();
        // PST is UTC-8, i.e. -8 * 3600 seconds.
        assert_eq!(parsed.offset_seconds(), -28800);
    }

    /// English relative day word.
    #[cfg(feature = "lang-en")]
    #[test]
    fn test_parse_nlp_yesterday() {
        let outcome = parse("yesterday");
        assert!(outcome.is_ok(), "Failed to parse 'yesterday': {:?}", outcome);
    }

    /// English "N units ago" expression.
    #[cfg(feature = "lang-en")]
    #[test]
    fn test_parse_nlp_3_hours_ago() {
        let outcome = parse("3 hours ago");
        assert!(
            outcome.is_ok(),
            "Failed to parse '3 hours ago': {:?}",
            outcome
        );
    }

    /// Chinese relative day word ("yesterday").
    #[cfg(feature = "lang-zh")]
    #[test]
    fn test_parse_nlp_chinese() {
        let outcome = parse("昨天");
        assert!(outcome.is_ok(), "Failed to parse '昨天': {:?}", outcome);
    }
}
349}