Skip to main content

feedparser_rs/util/
date.rs

1//! Multi-format date parsing for RSS and Atom feeds
2
3use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc};
4
/// Date format strings to try, in priority order
///
/// Order matters: more specific formats first, then fallbacks.
/// `parse_date` walks this list from top to bottom and returns the first
/// format that matches, so an earlier entry always wins over a later one.
///
/// Note on ambiguous numeric dates: the US `%m/%d/%Y` entries come before the
/// EU `%d/%m/%Y` entries, so an input such as `01/02/2024` is read as
/// January 2nd. The EU slash forms are only reached when the first field
/// cannot be a month (for example `14/12/2024`).
const DATE_FORMATS: &[&str] = &[
    // ISO 8601 / RFC 3339 variants (Atom)
    "%Y-%m-%dT%H:%M:%S%.f%:z", // 2024-12-14T10:30:45.123+00:00
    "%Y-%m-%dT%H:%M:%S%:z",    // 2024-12-14T10:30:45+00:00
    "%Y-%m-%dT%H:%M:%S%.fZ",   // 2024-12-14T10:30:45.123Z
    "%Y-%m-%dT%H:%M:%SZ",      // 2024-12-14T10:30:45Z
    "%Y-%m-%dT%H:%M:%S",       // 2024-12-14T10:30:45 (no timezone)
    "%Y-%m-%d %H:%M:%S",       // 2024-12-14 10:30:45
    "%Y-%m-%d",                // 2024-12-14
    // W3C Date-Time variants
    "%Y-%m-%d %H:%M:%S%:z", // 2024-12-14 10:30:45+00:00
    "%Y/%m/%d %H:%M:%S",    // 2024/12/14 10:30:45
    "%Y/%m/%d",             // 2024/12/14
    // RFC 822 variants (RSS pubDate) without weekday or timezone
    "%d %b %Y %H:%M:%S", // 14 Dec 2024 10:30:45
    "%d %b %Y",          // 14 Dec 2024
    "%d %B %Y %H:%M:%S", // 14 December 2024 10:30:45
    "%d %B %Y",          // 14 December 2024
    // US date formats (month first)
    "%B %d, %Y %H:%M:%S", // December 14, 2024 10:30:45
    "%B %d, %Y",          // December 14, 2024
    "%b %d, %Y %H:%M:%S", // Dec 14, 2024 10:30:45
    "%b %d, %Y",          // Dec 14, 2024
    "%m/%d/%Y %H:%M:%S",  // 12/14/2024 10:30:45
    "%m/%d/%Y",           // 12/14/2024
    "%m-%d-%Y",           // 12-14-2024
    // EU date formats (day first)
    "%d.%m.%Y %H:%M:%S", // 14.12.2024 10:30:45
    "%d.%m.%Y",          // 14.12.2024
    "%d/%m/%Y %H:%M:%S", // 14/12/2024 10:30:45
    "%d/%m/%Y",          // 14/12/2024
    "%d-%b-%Y",          // 14-Dec-2024
    "%d-%B-%Y",          // 14-December-2024
];
42
43/// Parse ASCTIME format: `Www Mmm [D]D HH:MM:SS YYYY` where DD may have a leading space.
44///
45/// Example: `Mon Jan  6 12:30:00 2025` or `Mon Jan 16 12:30:00 2025`
46fn parse_asctime(s: &str) -> Option<NaiveDateTime> {
47    // Expected: 3 alpha (weekday) + space + 3 alpha (month) + space + 1-2 digit day
48    //           (possibly space-padded) + space + HH:MM:SS + space + YYYY
49    let b = s.as_bytes();
50    if b.len() < 24 {
51        return None;
52    }
53    // Weekday: bytes 0..3 must be alpha
54    if !b[..3].iter().all(u8::is_ascii_alphabetic) || b[3] != b' ' {
55        return None;
56    }
57    // Strip weekday prefix
58    let rest = &s[4..];
59    // Normalize: collapse double-space before single-digit day to single space
60    // "Jan  6" → "Jan 6"
61    let normalized = if rest.len() > 4 && rest.as_bytes()[4] == b' ' && rest.as_bytes()[3] == b' ' {
62        // "Mmm  D ..." → "Mmm D ..."
63        let mut n = String::with_capacity(rest.len());
64        n.push_str(&rest[..3]); // month
65        n.push(' ');
66        n.push_str(rest[4..].trim_start_matches(' '));
67        n
68    } else {
69        rest.to_string()
70    };
71    NaiveDateTime::parse_from_str(&normalized, "%b %e %H:%M:%S %Y")
72        .or_else(|_| NaiveDateTime::parse_from_str(&normalized, "%b %d %H:%M:%S %Y"))
73        .ok()
74}
75
/// Remove a leading `"Www, "` weekday marker (three ASCII letters, a comma and a space).
///
/// Returns the text after the marker, or `None` when the input does not start
/// with such a marker or has nothing following it. The three letters are not
/// checked against real weekday names.
fn strip_weekday_prefix(s: &str) -> Option<&str> {
    match s.as_bytes() {
        // Five prefix bytes plus at least one byte of payload.
        [a, b, c, b',', b' ', _, ..]
            if [a, b, c].iter().all(|ch| ch.is_ascii_alphabetic()) =>
        {
            // The first five bytes are ASCII, so offset 5 is a char boundary.
            Some(&s[5..])
        }
        _ => None,
    }
}
85
86/// Parse date from string, trying multiple formats
87///
88/// This function attempts to parse dates in the following order:
89/// 1. RFC 3339 (Atom standard: 2024-12-14T10:30:00Z)
90/// 2. RFC 2822 (RSS standard: Sat, 14 Dec 2024 10:30:00 +0000)
91/// 3. Common format strings (ISO 8601 variants, US/EU formats)
92///
93/// # Arguments
94///
95/// * `input` - Date string to parse
96///
97/// # Returns
98///
99/// * `Some(DateTime<Utc>)` - Successfully parsed date
100/// * `None` - Could not parse date
101///
102/// # Examples
103///
104/// ```
105/// use feedparser_rs::util::date::parse_date;
106///
107/// // RFC 3339 (Atom)
108/// assert!(parse_date("2024-12-14T10:30:00Z").is_some());
109///
110/// // RFC 2822 (RSS)
111/// assert!(parse_date("Sat, 14 Dec 2024 10:30:00 +0000").is_some());
112///
113/// // ISO 8601 date-only
114/// assert!(parse_date("2024-12-14").is_some());
115///
116/// // Invalid date
117/// assert!(parse_date("not a date").is_none());
118/// ```
119#[must_use]
120pub fn parse_date(input: &str) -> Option<DateTime<Utc>> {
121    let input = input.trim();
122
123    if input.is_empty() {
124        return None;
125    }
126
127    // Try RFC 3339 first (most common in Atom)
128    if let Ok(dt) = DateTime::parse_from_rfc3339(input) {
129        return Some(dt.with_timezone(&Utc));
130    }
131
132    // Try RFC 2822 (RSS pubDate format)
133    if let Ok(dt) = DateTime::parse_from_rfc2822(input) {
134        return Some(dt.with_timezone(&Utc));
135    }
136
137    // Retry RFC 2822 with weekday prefix stripped — chrono validates the weekday
138    // strictly, but Python feedparser accepts wrong day-of-week names (#143)
139    if let Some(stripped) = strip_weekday_prefix(input)
140        && let Ok(dt) = DateTime::parse_from_rfc2822(stripped)
141    {
142        return Some(dt.with_timezone(&Utc));
143    }
144
145    // Special handling for year-only format (e.g., "2024")
146    if let Ok(year) = input.parse::<i32>()
147        && (1000..=9999).contains(&year)
148    {
149        return NaiveDate::from_ymd_opt(year, 1, 1)
150            .and_then(|d| d.and_hms_opt(0, 0, 0))
151            .map(|dt| dt.and_utc());
152    }
153
154    // Special handling for year-month format (e.g., "2024-12")
155    if input.len() == 7
156        && input.chars().nth(4) == Some('-')
157        && let (Ok(year), Ok(month)) = (input[..4].parse::<i32>(), input[5..7].parse::<u32>())
158        && (1000..=9999).contains(&year)
159        && (1..=12).contains(&month)
160    {
161        return NaiveDate::from_ymd_opt(year, month, 1)
162            .and_then(|d| d.and_hms_opt(0, 0, 0))
163            .map(|dt| dt.and_utc());
164    }
165
166    // Try ASCTIME format: "Mon Jan  6 12:30:00 2025"
167    if let Some(dt) = parse_asctime(input) {
168        return Some(dt.and_utc());
169    }
170
171    // Try all format strings
172    for fmt in DATE_FORMATS {
173        // Try parsing with time component
174        if let Ok(dt) = NaiveDateTime::parse_from_str(input, fmt) {
175            return Some(dt.and_utc());
176        }
177
178        // Try parsing date-only, assume midnight UTC
179        if let Ok(date) = NaiveDate::parse_from_str(input, fmt) {
180            return date.and_hms_opt(0, 0, 0).map(|dt| dt.and_utc());
181        }
182    }
183
184    // Could not parse
185    None
186}
187
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::{Datelike, Timelike};

    /// Parses `input`, failing the calling test with the offending string if
    /// `parse_date` rejects it.
    #[track_caller]
    fn parsed(input: &str) -> DateTime<Utc> {
        parse_date(input).unwrap_or_else(|| panic!("Failed to parse: {input}"))
    }

    // ---- RFC 3339 -------------------------------------------------------

    #[test]
    fn test_rfc3339_with_timezone() {
        let dt = parsed("2024-12-14T10:30:00+00:00");
        assert_eq!((dt.year(), dt.month(), dt.day()), (2024, 12, 14));
        assert_eq!((dt.hour(), dt.minute()), (10, 30));
    }

    #[test]
    fn test_rfc3339_z_suffix() {
        assert_eq!(parsed("2024-12-14T10:30:00Z").year(), 2024);
    }

    #[test]
    fn test_rfc3339_with_milliseconds() {
        assert!(parse_date("2024-12-14T10:30:00.123Z").is_some());
    }

    // ---- RFC 2822 -------------------------------------------------------

    #[test]
    fn test_rfc2822_format() {
        let dt = parsed("Sat, 14 Dec 2024 10:30:00 +0000");
        assert_eq!((dt.year(), dt.month()), (2024, 12));
    }

    #[test]
    fn test_rfc2822_gmt() {
        assert!(parse_date("Sat, 14 Dec 2024 10:30:00 GMT").is_some());
    }

    // ---- Assorted single-format checks ----------------------------------

    #[test]
    fn test_iso8601_date_only() {
        let dt = parsed("2024-12-14");
        assert_eq!((dt.year(), dt.month(), dt.day()), (2024, 12, 14));
        // Date-only input resolves to midnight
        assert_eq!(dt.hour(), 0);
    }

    #[test]
    fn test_us_format_long_month() {
        assert!(parse_date("December 14, 2024").is_some());
    }

    #[test]
    fn test_us_format_short_month() {
        assert!(parse_date("Dec 14, 2024").is_some());
    }

    #[test]
    fn test_invalid_date() {
        assert!(parse_date("not a date").is_none());
    }

    #[test]
    fn test_empty_string() {
        assert!(parse_date("").is_none());
    }

    #[test]
    fn test_whitespace_only() {
        assert!(parse_date("   ").is_none());
    }

    #[test]
    fn test_partial_date_invalid() {
        // Month 13 does not exist
        assert!(parse_date("2024-13").is_none());
        // Year part is not numeric
        assert!(parse_date("abcd-12").is_none());
    }

    #[test]
    fn test_us_date_slash_format() {
        assert!(parse_date("12/14/2024").is_some());
    }

    #[test]
    fn test_eu_date_dot_format() {
        assert!(parse_date("14.12.2024").is_some());
    }

    #[test]
    fn test_rfc822_without_day() {
        assert!(parse_date("14 Dec 2024").is_some());
    }

    #[test]
    fn test_rfc822_long_month() {
        assert!(parse_date("14 December 2024").is_some());
    }

    #[test]
    fn test_year_slash_format() {
        assert!(parse_date("2024/12/14").is_some());
    }

    #[test]
    fn test_dash_month_format() {
        assert!(parse_date("14-Dec-2024").is_some());
    }

    #[test]
    fn test_us_dash_format() {
        assert!(parse_date("12-14-2024").is_some());
    }

    #[test]
    fn test_eu_slash_with_time() {
        assert!(parse_date("14/12/2024 10:30:45").is_some());
    }

    #[test]
    fn test_multiple_formats_dont_panic() {
        // Only the absence of a panic matters here; results are discarded.
        let samples = [
            "2024-12-14T10:30:00Z",
            "Sat, 14 Dec 2024 10:30:00 GMT",
            "14 Dec 2024",
            "December 14, 2024",
            "12/14/2024",
            "14.12.2024",
            "2024/12/14",
            "14-Dec-2024",
            "not a date",
            "",
            "2024",
            "12/2024",
        ];

        samples.iter().for_each(|sample| {
            let _ = parse_date(sample);
        });
    }

    // ---- Weekday tolerance (#143) ---------------------------------------

    #[test]
    fn test_rfc2822_wrong_weekday() {
        // "Mon" is wrong (2026-01-15 is a Thursday) but must still parse
        let dt = parsed("Mon, 15 Jan 2026 10:30:00 +0000");
        assert_eq!((dt.year(), dt.month(), dt.day()), (2026, 1, 15));
        assert_eq!(dt.hour(), 10);
    }

    #[test]
    fn test_rfc2822_wrong_weekday_new_year() {
        // "Wed" is wrong (2026-01-01 is a Thursday) but must still parse
        let dt = parsed("Wed, 01 Jan 2026 00:00:00 +0000");
        assert_eq!((dt.year(), dt.month(), dt.day()), (2026, 1, 1));
    }

    #[test]
    fn test_rfc2822_correct_weekday() {
        // "Thu" matches 2026-01-15
        let dt = parsed("Thu, 15 Jan 2026 10:30:00 +0000");
        assert_eq!((dt.year(), dt.month(), dt.day()), (2026, 1, 15));
    }

    #[test]
    fn test_rfc2822_no_weekday() {
        let dt = parsed("15 Jan 2026 10:30:00 +0000");
        assert_eq!((dt.year(), dt.month(), dt.day()), (2026, 1, 15));
    }

    // ---- Calendar edge cases --------------------------------------------

    #[test]
    fn test_edge_case_leap_year() {
        assert!(parse_date("2024-02-29").is_some());
    }

    #[test]
    fn test_edge_case_invalid_date() {
        assert!(parse_date("2023-02-29").is_none());
    }

    // ---- Partial dates --------------------------------------------------

    #[test]
    fn test_year_only_format() {
        let dt = parsed("2024");
        assert_eq!((dt.year(), dt.month(), dt.day()), (2024, 1, 1));
        assert_eq!(dt.hour(), 0);
    }

    #[test]
    fn test_year_month_format() {
        let dt = parsed("2024-12");
        assert_eq!((dt.year(), dt.month(), dt.day()), (2024, 12, 1));
        assert_eq!(dt.hour(), 0);
    }

    // ---- Every entry of the format table --------------------------------

    #[test]
    fn test_all_format_strings() {
        // Each row: (input, expected_year, expected_month, expected_day)
        const CASES: &[(&str, i32, u32, u32)] = &[
            // ISO 8601 / RFC 3339 variants
            ("2024-12-14T10:30:45.123+00:00", 2024, 12, 14),
            ("2024-12-14T10:30:45+00:00", 2024, 12, 14),
            ("2024-12-14T10:30:45.123Z", 2024, 12, 14),
            ("2024-12-14T10:30:45Z", 2024, 12, 14),
            ("2024-12-14T10:30:45", 2024, 12, 14),
            ("2024-12-14 10:30:45", 2024, 12, 14),
            ("2024-12-14", 2024, 12, 14),
            // W3C Date-Time variants
            ("2024-12-14 10:30:45+00:00", 2024, 12, 14),
            ("2024/12/14 10:30:45", 2024, 12, 14),
            ("2024/12/14", 2024, 12, 14),
            // RFC 822 variants
            ("14 Dec 2024 10:30:45", 2024, 12, 14),
            ("14 Dec 2024", 2024, 12, 14),
            ("14 December 2024 10:30:45", 2024, 12, 14),
            ("14 December 2024", 2024, 12, 14),
            // US date formats
            ("December 14, 2024 10:30:45", 2024, 12, 14),
            ("December 14, 2024", 2024, 12, 14),
            ("Dec 14, 2024 10:30:45", 2024, 12, 14),
            ("Dec 14, 2024", 2024, 12, 14),
            ("12/14/2024 10:30:45", 2024, 12, 14),
            ("12/14/2024", 2024, 12, 14),
            ("12-14-2024", 2024, 12, 14),
            // EU date formats
            ("14.12.2024 10:30:45", 2024, 12, 14),
            ("14.12.2024", 2024, 12, 14),
            ("14/12/2024 10:30:45", 2024, 12, 14),
            ("14/12/2024", 2024, 12, 14),
            ("14-Dec-2024", 2024, 12, 14),
            ("14-December-2024", 2024, 12, 14),
            // Special cases
            ("2024", 2024, 1, 1),
            ("2024-12", 2024, 12, 1),
        ];

        for (input, year, month, day) in CASES.iter().copied() {
            let dt = parsed(input);
            assert_eq!(dt.year(), year, "Year mismatch for: {input}");
            assert_eq!(dt.month(), month, "Month mismatch for: {input}");
            assert_eq!(dt.day(), day, "Day mismatch for: {input}");
        }
    }

    // ---- ASCTIME --------------------------------------------------------

    #[test]
    fn test_asctime_single_digit_day_space_padded() {
        // Bug #258: the day in "Mon Jan  6 12:30:00 2025" is padded with a space
        let dt = parsed("Mon Jan  6 12:30:00 2025");
        assert_eq!((dt.year(), dt.month(), dt.day()), (2025, 1, 6));
        assert_eq!((dt.hour(), dt.minute(), dt.second()), (12, 30, 0));
    }

    #[test]
    fn test_asctime_double_digit_day() {
        let dt = parsed("Mon Jan 16 12:30:00 2025");
        assert_eq!((dt.year(), dt.month(), dt.day()), (2025, 1, 16));
    }

    #[test]
    fn test_asctime_various_months() {
        let dt = parsed("Fri Dec 31 23:59:59 2021");
        assert_eq!((dt.year(), dt.month(), dt.day()), (2021, 12, 31));
    }
}