Skip to main content

rosetta_date/
dateparser.rs

1//! Extended format parser inspired by the `dateparser` crate
2//! (<https://github.com/waltzofpearls/dateparser>).
3//!
4//! Handles Unix timestamps, Postgres-style datetimes, slash/dot separated
5//! dates, abbreviated/full month names, AM/PM, MySQL log format, Chinese
6//! date literals, and time-only inputs.
7
8use crate::datetime::RosettaDateTime;
9use crate::error::{Result, RosettaError};
10use crate::timezone::{self, TzOffset};
11
12// ── Public entry point ────────────────────────────────────────────────
13
14pub(crate) fn try_parse(input: &str, default_tz: &TzOffset) -> Result<RosettaDateTime> {
15    let s = input.trim();
16
17    // 1. Unix timestamp (all digits, optional leading '-')
18    if let Some(dt) = try_unix_timestamp(s) {
19        return Ok(dt);
20    }
21
22    // 2. Chinese date literals: 2014年04月08日…
23    if let Some(dt) = try_chinese_format(s, default_tz) {
24        return Ok(dt);
25    }
26
27    // 3. MySQL log: 171113 14:14:20
28    if let Some(dt) = try_mysql_log(s, default_tz) {
29        return Ok(dt);
30    }
31
32    // 4. Postgres-style offset right-glued to time with no space: 08:08-08
33    // (handled inside try_normalized_heuristic via extract_trailing_timezone)
34
35    // 5. General heuristic that normalises the string and dispatches to
36    //    sub-parsers (month names, slash dates, dot dates, AM/PM, time-only…)
37    if let Some(dt) = try_normalized(s, default_tz) {
38        return Ok(dt);
39    }
40
41    Err(RosettaError::ParseError(
42        "No dateparser format matched".into(),
43    ))
44}
45
46// ── Unix timestamps ───────────────────────────────────────────────────
47
48fn try_unix_timestamp(input: &str) -> Option<RosettaDateTime> {
49    // Accept strings of only ASCII digits (no sign for now – negative
50    // timestamps are exotic and conflict with date separators).
51    if input.is_empty() || !input.bytes().all(|b| b.is_ascii_digit()) {
52        return None;
53    }
54
55    let len = input.len();
56    let val: i64 = input.parse().ok()?;
57
58    let seconds = if len <= 10 {
59        // seconds
60        val
61    } else if len <= 13 {
62        // milliseconds
63        val / 1_000
64    } else if len <= 19 {
65        // microseconds or nanoseconds
66        val / 1_000_000_000
67    } else {
68        return None;
69    };
70
71    let base = RosettaDateTime::from_components(1970, 1, 1, 0, 0, 0, TzOffset::UTC).ok()?;
72    Some(base.add_seconds(seconds))
73}
74
75// ── Chinese date literals ─────────────────────────────────────────────
76
77fn try_chinese_format(input: &str, default_tz: &TzOffset) -> Option<RosettaDateTime> {
78    if !input.contains('年') {
79        return None;
80    }
81    // 2014年04月08日11时25分18秒  /  2014年04月08日
82    let s = input
83        .replace(['年', '月'], "-")
84        .replace('日', " ")
85        .replace(['时', '分'], ":")
86        .replace('秒', "");
87
88    parse_ymd_hms(s.trim(), default_tz)
89}
90
91// ── MySQL log format: yymmdd hh:mm:ss ────────────────────────────────
92
93fn try_mysql_log(input: &str, default_tz: &TzOffset) -> Option<RosettaDateTime> {
94    // Exactly "YYMMDD HH:MM:SS" — 6-digit date followed by a space then time
95    let parts: Vec<&str> = input.splitn(2, ' ').collect();
96    if parts.len() != 2 {
97        return None;
98    }
99    let date_part = parts[0];
100    let time_part = parts[1];
101
102    if date_part.len() != 6 || !date_part.bytes().all(|b| b.is_ascii_digit()) {
103        return None;
104    }
105
106    let yy: i32 = date_part[..2].parse().ok()?;
107    let mm: u8 = date_part[2..4].parse().ok()?;
108    let dd: u8 = date_part[4..6].parse().ok()?;
109    let year = if yy < 70 { 2000 + yy } else { 1900 + yy };
110
111    let (h, min, s) = parse_time_str(time_part)?;
112    RosettaDateTime::from_components(year, mm, dd, h, min, s, *default_tz).ok()
113}
114
115// ── General normalised heuristic ──────────────────────────────────────
116
117fn try_normalized(input: &str, default_tz: &TzOffset) -> Option<RosettaDateTime> {
118    // Postgres-style: "2019-11-29 08:08-08" or "2021-05-02 23:31:36.0741-07"
119    // The offset is glued right after digits with no space. Pre-process it by
120    // scanning the time part for a ±NN offset.
121    let postgres_split = split_postgres_offset(input);
122    let (input_for_tz, extra_offset): (&str, Option<TzOffset>) = match &postgres_split {
123        Some((body_str, off)) => (body_str.as_str(), Some(*off)),
124        None => (input, None),
125    };
126
127    // Strip trailing timezone first so the remainder is "clean"
128    let (body, offset) = if let Some(off) = extra_offset {
129        (input_for_tz, off)
130    } else {
131        timezone::extract_trailing_timezone(input_for_tz).unwrap_or((input_for_tz, *default_tz))
132    };
133    let body = body.trim();
134
135    // Detect if first token looks like a month name → month-name-led format
136    let first_tok: &str = body.split_whitespace().next().unwrap_or("");
137    if month_from_name(first_tok).is_some() {
138        return try_month_name_first(body, &offset);
139    }
140
141    // Detect if first token is a 1-2 digit day followed by a month name
142    // e.g. "12 Feb 2006" / "7 oct 70"
143    let toks: Vec<&str> = body.split_whitespace().collect();
144    if toks.len() >= 2 {
145        let maybe_day: bool = toks[0].parse::<u8>().is_ok();
146        let maybe_month: bool = month_from_name(toks[1]).is_some();
147        if maybe_day && maybe_month {
148            return try_day_month_year(body, &offset);
149        }
150    }
151
152    // Time-only: "01:06:06", "4:00pm"
153    if looks_like_time_only(body) {
154        return try_time_only(body, default_tz, &offset);
155    }
156
157    // Normalise "/" and "." separators to "-" then try standard-ish parsing
158    let normalised = normalise_separators(body);
159    parse_ymd_hms(normalised.trim(), &offset)
160}
161
162// ── Month-name-first formats ──────────────────────────────────────────
163// "May 6 at 9:24 PM", "May 8, 2009 5:57:51 PM", "September 17, 2012, 10:10:09"
164// "May 25, 2021", "oct 7, 1970", "2021-Feb-21" (handled in parse_ymd_hms via
165//  normalise_separators → month name in middle position)
166
167fn try_month_name_first(body: &str, offset: &TzOffset) -> Option<RosettaDateTime> {
168    // Flatten commas and "at" to spaces, then split into tokens
169    let cleaned = body.replace(',', " ").replace(" at ", " ").to_lowercase();
170    let toks: Vec<&str> = cleaned.split_whitespace().collect();
171    if toks.is_empty() {
172        return None;
173    }
174
175    let month = month_from_name(toks[0])?;
176
177    match toks.len() {
178        // "May 25 2021" or "May 6 9:24 PM" (after "at" removal)
179        n if n >= 2 => {
180            // Second token: day (possibly followed by ordinal suffix stripped already)
181            let day_str = strip_ordinal(toks[1]);
182            if let Ok(day) = day_str.parse::<u8>() {
183                // Third token might be year or time
184                if n == 2 {
185                    // "oct 7 70" – no year → use current year? But PLAN has "oct 7, 1970"
186                    // Minimal: just date
187                    let year = default_year();
188                    return RosettaDateTime::from_components(year, month, day, 0, 0, 0, *offset)
189                        .ok();
190                }
191                // Could be year first, then optional time
192                // or time first then year
193                let third = toks[2];
194                if let Ok(year) = expand_year(third) {
195                    // Year then optional time: "May 8, 2009 5:57:51 PM"
196                    let (h, m, s, _ampm) = parse_time_tokens(&toks[3..]);
197                    return RosettaDateTime::from_components(year, month, day, h, m, s, *offset)
198                        .ok();
199                } else if third.contains(':') || third.ends_with("am") || third.ends_with("pm") {
200                    // Time first, no year: "May 6 9:24 PM" (year unknown)
201                    let (hr, m, s, _am_pm) = parse_time_tokens(&toks[2..]);
202                    let year = default_year();
203                    return RosettaDateTime::from_components(year, month, day, hr, m, s, *offset)
204                        .ok();
205                }
206            }
207            // Fallback: try "oct 7, 70" where third is 2-digit year
208            if let Some(day_val) = toks
209                .get(1)
210                .and_then(|t| strip_ordinal(t).parse::<u8>().ok())
211                && let Some(yr_str) = toks.get(2)
212                && let Ok(yr) = expand_year(yr_str)
213            {
214                return RosettaDateTime::from_components(yr, month, day_val, 0, 0, 0, *offset).ok();
215            }
216            None
217        }
218        _ => None,
219    }
220}
221
222/// Parse "7 oct 70", "12 Feb 2006, 19:17", "14 May 2019 19:11:40.164", etc.
223fn try_day_month_year(body: &str, offset: &TzOffset) -> Option<RosettaDateTime> {
224    let cleaned = body.replace(',', " ").to_lowercase();
225    let toks: Vec<&str> = cleaned.split_whitespace().collect();
226
227    let day: u8 = toks.first()?.parse().ok()?;
228    let month = month_from_name(toks.get(1)?)?;
229    let year = expand_year(toks.get(2)?).ok()?;
230
231    let (h, m, s, _) = parse_time_tokens(toks.get(3..).unwrap_or(&[]));
232    RosettaDateTime::from_components(year, month, day, h, m, s, *offset).ok()
233}
234
235// ── Time-only ─────────────────────────────────────────────────────────
236
/// Cheap shape test for time-only inputs such as "01:06:06", "4:00pm" or
/// "6:00 AM".
///
/// Only the first whitespace-delimited token is inspected. It qualifies when
/// it starts with an ASCII digit and, once a trailing "am"/"pm" (any case) is
/// removed, contains a ':' preceded by at most two bytes and contains neither
/// '-' nor '/'.
fn looks_like_time_only(s: &str) -> bool {
    let token = match s.split_whitespace().next() {
        Some(tok) => tok,
        None => return false,
    };

    // A multi-byte first character can never be an ASCII digit, so looking at
    // the first byte is enough.
    if !token.as_bytes().first().map_or(false, u8::is_ascii_digit) {
        return false;
    }

    let lowered = token.to_lowercase();
    let clock = lowered.trim_end_matches("am").trim_end_matches("pm");

    if clock.contains('-') || clock.contains('/') {
        return false;
    }

    // The byte index of the first ':' equals the length of the hour field.
    match clock.find(':') {
        Some(hour_len) => hour_len <= 2,
        None => false,
    }
}
260
261fn try_time_only(body: &str, _default_tz: &TzOffset, offset: &TzOffset) -> Option<RosettaDateTime> {
262    let toks: Vec<&str> = body.split_whitespace().collect();
263    let (h, m, s, _) = parse_time_tokens(&toks);
264    let today = RosettaDateTime::now_utc();
265    RosettaDateTime::from_components(today.year(), today.month(), today.day(), h, m, s, *offset)
266        .ok()
267}
268
269// ── YMD + HMS core parser (after normalisation) ───────────────────────
270// Handles "yyyy-mm-dd", "yyyy-mm-dd hh:mm:ss[.frac]",
271// "2021-Feb-21" (after normalise_separators makes it "2021-Feb-21")
272
273fn parse_ymd_hms(s: &str, offset: &TzOffset) -> Option<RosettaDateTime> {
274    // Split on whitespace; first part is date, optional second part is time
275    let mut parts = s.splitn(3, ' ');
276    let date_str = parts.next()?;
277    let time_str = parts.next(); // may be None
278
279    // Split date on '-'
280    let date_segs: Vec<&str> = date_str.split('-').collect();
281
282    let (year, month, day) = match date_segs.len() {
283        3 => {
284            let (y, m, d) = parse_date_segments(&date_segs)?;
285            (y, m, d)
286        }
287        2 => {
288            // "2014.03" → month-only, day=1
289            let y: i32 = date_segs[0].parse().ok()?;
290            let m: u8 = date_segs[1].parse().ok()?;
291            (y, m, 1u8)
292        }
293        _ => return None,
294    };
295
296    let (h, min, s) = if let Some(ts) = time_str {
297        parse_time_str_with_ampm(ts)?
298    } else {
299        (0, 0, 0)
300    };
301
302    RosettaDateTime::from_components(year, month, day, h, min, s, *offset).ok()
303}
304
305/// Parse ["2021", "04", "30"] or ["2021", "Feb", "21"] etc.
306fn parse_date_segments(segs: &[&str]) -> Option<(i32, u8, u8)> {
307    debug_assert_eq!(segs.len(), 3);
308    let a = segs[0];
309    let b = segs[1];
310    let c = segs[2];
311
312    // Check if middle segment is a month name (yyyy-Mon-dd)
313    if let Some(m) = month_from_name(b) {
314        let y: i32 = expand_year(a).ok()?;
315        let d: u8 = c.parse().ok()?;
316        return Some((y, m, d));
317    }
318    if let Some(m) = month_from_name(a) {
319        // Mon-dd-yyyy
320        let d: u8 = b.parse().ok()?;
321        let y: i32 = expand_year(c).ok()?;
322        return Some((y, m, d));
323    }
324
325    let ai: i32 = a.parse().ok()?;
326    let bi: i32 = b.parse().ok()?;
327    let ci: i32 = c.parse().ok()?;
328
329    // YMD if first component > 31
330    if ai > 31 {
331        return Some((ai, bi as u8, ci as u8));
332    }
333    // MDY if last component > 31 and first <= 12
334    if ci > 31 && ai <= 12 {
335        return Some((expand_year_i32(ci), ai as u8, bi as u8));
336    }
337    // DMY if last component > 31 and first > 12
338    if ci > 31 && ai > 12 {
339        return Some((expand_year_i32(ci), bi as u8, ai as u8));
340    }
341    // 2-digit year: assume MDY (common in US formats)
342    if ci <= 99 && ai <= 12 {
343        return Some((expand_year_i32(ci), ai as u8, bi as u8));
344    }
345
346    None
347}
348
349// ── Separator normalisation ───────────────────────────────────────────
350
/// Rewrites '/' and '.' to '-' in the date portion of `s`.
///
/// The date portion is everything before the first space (or the whole
/// string when there is none). The remainder is copied unchanged so that the
/// dot of fractional seconds in the time part survives.
fn normalise_separators(s: &str) -> String {
    let boundary = s.find(' ').unwrap_or(s.len());
    let (date_part, remainder) = s.split_at(boundary);

    let mut out = date_part.replace(['/', '.'], "-");
    out.push_str(remainder);
    out
}
366
367// ── Time string parsers ───────────────────────────────────────────────
368
/// Parses "hh[:mm[:ss[.frac]]][ ][am|pm]" into `(hour, minute, second)`.
///
/// * Missing minutes or seconds default to 0; fractional seconds are
///   truncated.
/// * A trailing "am"/"pm" (any case, optionally separated by whitespace)
///   switches to 12-hour interpretation: 12am → 0, 12pm → 12, other pm hours
///   get 12 added.
/// * Without a suffix the hour is taken as-is, so 24-hour "12:30" stays 12:30.
///
/// Returns `None` when a field is not a number or the resulting hour exceeds
/// 23. Minutes and seconds are not range-checked here.
fn parse_time_str(ts: &str) -> Option<(u8, u8, u8)> {
    enum Meridiem {
        Am,
        Pm,
    }

    // `get` returns None when the cut is not on a char boundary, which also
    // means the last two bytes cannot be an ASCII "am"/"pm".
    let suffix_start = ts.len().saturating_sub(2);
    let (clock, meridiem) = match ts.get(suffix_start..) {
        Some(tail) if tail.eq_ignore_ascii_case("pm") => {
            (ts[..suffix_start].trim_end(), Some(Meridiem::Pm))
        }
        Some(tail) if tail.eq_ignore_ascii_case("am") => {
            (ts[..suffix_start].trim_end(), Some(Meridiem::Am))
        }
        _ => (ts, None),
    };

    let mut fields = clock.splitn(3, ':');
    let mut hour: u8 = fields.next()?.trim().parse().ok()?;
    let minute: u8 = fields.next().unwrap_or("0").trim().parse().ok()?;
    // Strip fractional seconds before parsing.
    let second_field = fields.next().unwrap_or("0");
    let second: u8 = second_field
        .split('.')
        .next()
        .unwrap_or("0")
        .trim()
        .parse()
        .ok()?;

    // The 12-hour adjustments apply only when a meridiem was actually given.
    match meridiem {
        Some(Meridiem::Pm) if hour != 12 => hour = hour.saturating_add(12),
        Some(Meridiem::Am) if hour == 12 => hour = 0,
        _ => {}
    }

    if hour > 23 {
        return None;
    }
    Some((hour, minute, second))
}
402
403fn parse_time_str_with_ampm(ts: &str) -> Option<(u8, u8, u8)> {
404    // Handle combined token like "5:57:51PM" or separate "5:57:51 PM"
405    // Both handled by parse_time_str.
406    // If space-separated AM/PM: join back.
407    let toks: Vec<&str> = ts.split_whitespace().collect();
408    let unified = toks.join("");
409    parse_time_str(&unified)
410}
411
412/// Parse a slice of string tokens to extract (hour, min, sec, has_ampm).
413/// Tokens may be like ["5:57:51", "PM"] or ["10:09am"] or ["19:17"] etc.
414fn parse_time_tokens(toks: &[&str]) -> (u8, u8, u8, bool) {
415    if toks.is_empty() {
416        return (0, 0, 0, false);
417    }
418    let joined = toks.join(" ");
419    if let Some((h, m, s)) = parse_time_str(&joined) {
420        let lower = joined.to_lowercase();
421        let has = lower.contains("pm") || lower.contains("am");
422        (h, m, s, has)
423    } else {
424        (0, 0, 0, false)
425    }
426}
427
428// ── Month name helpers ────────────────────────────────────────────────
429
/// Maps an English month name to its number (1–12).
///
/// Accepts the three-letter abbreviation or the full name in any case, plus
/// "sept" for September. Trailing dots ("Jan.") are ignored. Returns `None`
/// for anything else.
fn month_from_name(s: &str) -> Option<u8> {
    // (abbreviation, full name), indexed by month - 1.
    const NAMES: [(&str, &str); 12] = [
        ("jan", "january"),
        ("feb", "february"),
        ("mar", "march"),
        ("apr", "april"),
        ("may", "may"),
        ("jun", "june"),
        ("jul", "july"),
        ("aug", "august"),
        ("sep", "september"),
        ("oct", "october"),
        ("nov", "november"),
        ("dec", "december"),
    ];

    let key = s.trim_end_matches('.').to_lowercase();
    if key == "sept" {
        return Some(9);
    }

    NAMES
        .iter()
        .position(|&(short, long)| key == short || key == long)
        .map(|idx| idx as u8 + 1)
}
448
449// ── Year helpers ──────────────────────────────────────────────────────
450
451fn expand_year(s: &str) -> std::result::Result<i32, ()> {
452    let n: i32 = s.parse().map_err(|_| ())?;
453    Ok(expand_year_i32(n))
454}
455
/// Expands a two-digit year: 0–69 becomes 2000–2069 and 70–99 becomes
/// 1970–1999. Every other value (including negatives) is returned unchanged.
fn expand_year_i32(n: i32) -> i32 {
    if (0..70).contains(&n) {
        n + 2000
    } else if (70..100).contains(&n) {
        n + 1900
    } else {
        n
    }
}
463
464fn default_year() -> i32 {
465    // Use current year as fallback for month-day-only inputs
466    RosettaDateTime::now_utc().year()
467}
468
/// Removes English ordinal suffixes from the end of `s` ("7th" → "7").
///
/// The suffixes "st", "nd", "rd" and "th" are stripped in that order, each
/// one repeatedly. The match is case-sensitive and does not check that the
/// remainder is numeric; callers parse the result afterwards.
fn strip_ordinal(s: &str) -> &str {
    let mut rest = s;
    for suffix in ["st", "nd", "rd", "th"].iter() {
        rest = rest.trim_end_matches(*suffix);
    }
    rest
}
475
476// ── Postgres offset splitter ──────────────────────────────────────────
477// Handles "2019-11-29 08:08-08" where the offset is glued to the time part
478// without a separating space.
479
480fn split_postgres_offset(input: &str) -> Option<(String, TzOffset)> {
481    // We only activate this for strings that have a date portion (contain '-')
482    // and whose time segment has a glued offset.
483    // Strategy: find the second whitespace-delimited token (time token) and
484    // look for a leading '+'/'-' that isn't part of a ':' time component.
485
486    let parts: Vec<&str> = input.splitn(2, ' ').collect();
487    if parts.len() != 2 {
488        return None;
489    }
490    let date_part = parts[0];
491    let time_and_offset = parts[1];
492
493    // Must look like a date (contains '-')
494    if !date_part.contains('-') {
495        return None;
496    }
497
498    // Find +/- in time_and_offset that is NOT at the start of the whole
499    // string and is preceded by a digit.
500    // We scan for the first occurrence of '+' or '-' that follows a digit.
501    let bytes = time_and_offset.as_bytes();
502    let mut split_pos: Option<usize> = None;
503    for i in 1..bytes.len() {
504        let b = bytes[i];
505        if (b == b'+' || b == b'-') && bytes[i - 1].is_ascii_digit() {
506            split_pos = Some(i);
507            break;
508        }
509    }
510
511    let pos = split_pos?;
512    let time_clean = &time_and_offset[..pos];
513    let offset_str = &time_and_offset[pos..];
514
515    let offset = crate::timezone::parse_timezone(offset_str).ok()?;
516    Some((format!("{} {}", date_part, time_clean), offset))
517}