Skip to main content

feedparser_rs/util/
date.rs

1//! Multi-format date parsing for RSS and Atom feeds
2
3use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc};
4
/// `strftime`-style patterns tried as a last resort, in priority order.
///
/// `parse_date` walks this slice front to back and returns the first pattern
/// that matches, so order matters:
///
/// * more specific patterns (fractional seconds, explicit offset) come before
///   their shorter fallbacks;
/// * the US month-first slash patterns come before the EU day-first ones, so
///   an ambiguous value such as `01/02/2024` is read as January 2nd.
const DATE_FORMATS: &[&str] = &[
    // ISO 8601 / RFC 3339 variants (Atom)
    "%Y-%m-%dT%H:%M:%S%.f%:z", // 2024-12-14T10:30:45.123+00:00
    "%Y-%m-%dT%H:%M:%S%:z",    // 2024-12-14T10:30:45+00:00
    "%Y-%m-%dT%H:%M:%S%.fZ",   // 2024-12-14T10:30:45.123Z
    "%Y-%m-%dT%H:%M:%SZ",      // 2024-12-14T10:30:45Z
    "%Y-%m-%dT%H:%M:%S",       // 2024-12-14T10:30:45 (no timezone)
    "%Y-%m-%d %H:%M:%S",       // 2024-12-14 10:30:45
    "%Y-%m-%d",                // 2024-12-14
    // W3C Date-Time variants
    "%Y-%m-%d %H:%M:%S%:z", // 2024-12-14 10:30:45+00:00
    "%Y/%m/%d %H:%M:%S",    // 2024/12/14 10:30:45
    "%Y/%m/%d",             // 2024/12/14
    // RFC 822 variants (RSS pubDate)
    "%d %b %Y %H:%M:%S", // 14 Dec 2024 10:30:45
    "%d %b %Y",          // 14 Dec 2024
    "%d %B %Y %H:%M:%S", // 14 December 2024 10:30:45
    "%d %B %Y",          // 14 December 2024
    // US date formats
    "%B %d, %Y %H:%M:%S", // December 14, 2024 10:30:45
    "%B %d, %Y",          // December 14, 2024
    "%b %d, %Y %H:%M:%S", // Dec 14, 2024 10:30:45
    "%b %d, %Y",          // Dec 14, 2024
    "%m/%d/%Y %H:%M:%S",  // 12/14/2024 10:30:45
    "%m/%d/%Y",           // 12/14/2024
    "%m-%d-%Y",           // 12-14-2024
    // EU date formats
    "%d.%m.%Y %H:%M:%S", // 14.12.2024 10:30:45
    "%d.%m.%Y",          // 14.12.2024
    "%d/%m/%Y %H:%M:%S", // 14/12/2024 10:30:45
    "%d/%m/%Y",          // 14/12/2024
    "%d-%b-%Y",          // 14-Dec-2024
    "%d-%B-%Y",          // 14-December-2024
];
42
/// Strip a leading weekday prefix of the form `"Www, "` (3 ASCII alpha chars + ", ").
///
/// Returns the text after the prefix, or `None` when `s` does not start with
/// such a prefix or when nothing follows it. The three letters are not
/// checked against real day names; any ASCII letters are accepted.
fn strip_weekday_prefix(s: &str) -> Option<&str> {
    match s.as_bytes() {
        // The trailing `_` requires at least one byte after the prefix.
        [a, b, c, b',', b' ', _, ..] if [a, b, c].iter().all(|ch| ch.is_ascii_alphabetic()) => {
            // The first five bytes are ASCII, so byte 5 is a char boundary.
            s.get(5..)
        }
        _ => None,
    }
}
52
53/// Parse date from string, trying multiple formats
54///
55/// This function attempts to parse dates in the following order:
56/// 1. RFC 3339 (Atom standard: 2024-12-14T10:30:00Z)
57/// 2. RFC 2822 (RSS standard: Sat, 14 Dec 2024 10:30:00 +0000)
58/// 3. Common format strings (ISO 8601 variants, US/EU formats)
59///
60/// # Arguments
61///
62/// * `input` - Date string to parse
63///
64/// # Returns
65///
66/// * `Some(DateTime<Utc>)` - Successfully parsed date
67/// * `None` - Could not parse date
68///
69/// # Examples
70///
71/// ```
72/// use feedparser_rs::util::date::parse_date;
73///
74/// // RFC 3339 (Atom)
75/// assert!(parse_date("2024-12-14T10:30:00Z").is_some());
76///
77/// // RFC 2822 (RSS)
78/// assert!(parse_date("Sat, 14 Dec 2024 10:30:00 +0000").is_some());
79///
80/// // ISO 8601 date-only
81/// assert!(parse_date("2024-12-14").is_some());
82///
83/// // Invalid date
84/// assert!(parse_date("not a date").is_none());
85/// ```
86#[must_use]
87pub fn parse_date(input: &str) -> Option<DateTime<Utc>> {
88    let input = input.trim();
89
90    if input.is_empty() {
91        return None;
92    }
93
94    // Try RFC 3339 first (most common in Atom)
95    if let Ok(dt) = DateTime::parse_from_rfc3339(input) {
96        return Some(dt.with_timezone(&Utc));
97    }
98
99    // Try RFC 2822 (RSS pubDate format)
100    if let Ok(dt) = DateTime::parse_from_rfc2822(input) {
101        return Some(dt.with_timezone(&Utc));
102    }
103
104    // Retry RFC 2822 with weekday prefix stripped — chrono validates the weekday
105    // strictly, but Python feedparser accepts wrong day-of-week names (#143)
106    if let Some(stripped) = strip_weekday_prefix(input)
107        && let Ok(dt) = DateTime::parse_from_rfc2822(stripped)
108    {
109        return Some(dt.with_timezone(&Utc));
110    }
111
112    // Special handling for year-only format (e.g., "2024")
113    if let Ok(year) = input.parse::<i32>()
114        && (1000..=9999).contains(&year)
115    {
116        return NaiveDate::from_ymd_opt(year, 1, 1)
117            .and_then(|d| d.and_hms_opt(0, 0, 0))
118            .map(|dt| dt.and_utc());
119    }
120
121    // Special handling for year-month format (e.g., "2024-12")
122    if input.len() == 7
123        && input.chars().nth(4) == Some('-')
124        && let (Ok(year), Ok(month)) = (input[..4].parse::<i32>(), input[5..7].parse::<u32>())
125        && (1000..=9999).contains(&year)
126        && (1..=12).contains(&month)
127    {
128        return NaiveDate::from_ymd_opt(year, month, 1)
129            .and_then(|d| d.and_hms_opt(0, 0, 0))
130            .map(|dt| dt.and_utc());
131    }
132
133    // Try all format strings
134    for fmt in DATE_FORMATS {
135        // Try parsing with time component
136        if let Ok(dt) = NaiveDateTime::parse_from_str(input, fmt) {
137            return Some(dt.and_utc());
138        }
139
140        // Try parsing date-only, assume midnight UTC
141        if let Ok(date) = NaiveDate::parse_from_str(input, fmt) {
142            return date.and_hms_opt(0, 0, 0).map(|dt| dt.and_utc());
143        }
144    }
145
146    // Could not parse
147    None
148}
149
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::{Datelike, Timelike};

    /// Parse `input` and assert the calendar date of the result.
    fn assert_ymd(input: &str, year: i32, month: u32, day: u32) {
        let dt = parse_date(input).unwrap_or_else(|| panic!("Failed to parse: {input}"));
        assert_eq!(dt.year(), year, "Year mismatch for: {input}");
        assert_eq!(dt.month(), month, "Month mismatch for: {input}");
        assert_eq!(dt.day(), day, "Day mismatch for: {input}");
    }

    #[test]
    fn test_rfc3339_with_timezone() {
        let dt = parse_date("2024-12-14T10:30:00+00:00").unwrap();
        assert_eq!(dt.year(), 2024);
        assert_eq!(dt.month(), 12);
        assert_eq!(dt.day(), 14);
        assert_eq!(dt.hour(), 10);
        assert_eq!(dt.minute(), 30);
    }

    #[test]
    fn test_rfc3339_offset_normalized_to_utc() {
        // 10:30 at +05:00 is 05:30 UTC
        let dt = parse_date("2024-12-14T10:30:00+05:00").unwrap();
        assert_eq!(dt.day(), 14);
        assert_eq!(dt.hour(), 5);
        assert_eq!(dt.minute(), 30);
    }

    #[test]
    fn test_rfc3339_z_suffix() {
        let dt = parse_date("2024-12-14T10:30:00Z").unwrap();
        assert_eq!(dt.year(), 2024);
        assert_eq!(dt.hour(), 10);
    }

    #[test]
    fn test_rfc3339_with_milliseconds() {
        let dt = parse_date("2024-12-14T10:30:00.123Z").unwrap();
        assert_eq!(dt.year(), 2024);
        assert_eq!(dt.hour(), 10);
        assert_eq!(dt.minute(), 30);
    }

    #[test]
    fn test_rfc2822_format() {
        let dt = parse_date("Sat, 14 Dec 2024 10:30:00 +0000").unwrap();
        assert_eq!(dt.year(), 2024);
        assert_eq!(dt.month(), 12);
        assert_eq!(dt.day(), 14);
        assert_eq!(dt.hour(), 10);
    }

    #[test]
    fn test_rfc2822_offset_normalized_to_utc() {
        // 10:30 at +0200 is 08:30 UTC
        let dt = parse_date("Sat, 14 Dec 2024 10:30:00 +0200").unwrap();
        assert_eq!(dt.day(), 14);
        assert_eq!(dt.hour(), 8);
        assert_eq!(dt.minute(), 30);
    }

    #[test]
    fn test_rfc2822_gmt() {
        let dt = parse_date("Sat, 14 Dec 2024 10:30:00 GMT").unwrap();
        assert_eq!(dt.year(), 2024);
        assert_eq!(dt.hour(), 10);
    }

    #[test]
    fn test_iso8601_date_only() {
        let dt = parse_date("2024-12-14").unwrap();
        assert_eq!(dt.year(), 2024);
        assert_eq!(dt.month(), 12);
        assert_eq!(dt.day(), 14);
        assert_eq!(dt.hour(), 0); // Midnight
    }

    #[test]
    fn test_surrounding_whitespace_is_ignored() {
        assert_ymd("  2024-12-14\n", 2024, 12, 14);
    }

    #[test]
    fn test_us_format_long_month() {
        assert_ymd("December 14, 2024", 2024, 12, 14);
    }

    #[test]
    fn test_us_format_short_month() {
        assert_ymd("Dec 14, 2024", 2024, 12, 14);
    }

    #[test]
    fn test_invalid_date() {
        assert!(parse_date("not a date").is_none());
    }

    #[test]
    fn test_empty_string() {
        assert!(parse_date("").is_none());
    }

    #[test]
    fn test_whitespace_only() {
        assert!(parse_date("   ").is_none());
    }

    #[test]
    fn test_partial_date_invalid() {
        // Invalid partial dates should fail
        assert!(parse_date("2024-13").is_none()); // Invalid month
        assert!(parse_date("abcd-12").is_none());
    }

    #[test]
    fn test_us_date_slash_format() {
        assert_ymd("12/14/2024", 2024, 12, 14);
    }

    #[test]
    fn test_eu_date_dot_format() {
        assert_ymd("14.12.2024", 2024, 12, 14);
    }

    #[test]
    fn test_rfc822_without_day() {
        assert_ymd("14 Dec 2024", 2024, 12, 14);
    }

    #[test]
    fn test_rfc822_long_month() {
        assert_ymd("14 December 2024", 2024, 12, 14);
    }

    #[test]
    fn test_year_slash_format() {
        assert_ymd("2024/12/14", 2024, 12, 14);
    }

    #[test]
    fn test_dash_month_format() {
        assert_ymd("14-Dec-2024", 2024, 12, 14);
    }

    #[test]
    fn test_us_dash_format() {
        assert_ymd("12-14-2024", 2024, 12, 14);
    }

    #[test]
    fn test_eu_slash_with_time() {
        assert_ymd("14/12/2024 10:30:45", 2024, 12, 14);
    }

    #[test]
    fn test_multiple_formats_dont_panic() {
        let dates = [
            "2024-12-14T10:30:00Z",
            "Sat, 14 Dec 2024 10:30:00 GMT",
            "14 Dec 2024",
            "December 14, 2024",
            "12/14/2024",
            "14.12.2024",
            "2024/12/14",
            "14-Dec-2024",
            "not a date",
            "",
            "2024",
            "12/2024",
        ];

        for date_str in dates {
            let _ = parse_date(date_str);
        }
    }

    #[test]
    fn test_rfc2822_wrong_weekday() {
        // Mon is wrong (actual day is Thu), but date should still parse (#143)
        let dt = parse_date("Mon, 15 Jan 2026 10:30:00 +0000").unwrap();
        assert_eq!(dt.year(), 2026);
        assert_eq!(dt.month(), 1);
        assert_eq!(dt.day(), 15);
        assert_eq!(dt.hour(), 10);
    }

    #[test]
    fn test_rfc2822_wrong_weekday_new_year() {
        // Wed is wrong (actual day is Thu), but date should still parse (#143)
        let dt = parse_date("Wed, 01 Jan 2026 00:00:00 +0000").unwrap();
        assert_eq!(dt.year(), 2026);
        assert_eq!(dt.month(), 1);
        assert_eq!(dt.day(), 1);
    }

    #[test]
    fn test_rfc2822_correct_weekday() {
        // Thu is correct for 2026-01-15
        let dt = parse_date("Thu, 15 Jan 2026 10:30:00 +0000").unwrap();
        assert_eq!(dt.year(), 2026);
        assert_eq!(dt.month(), 1);
        assert_eq!(dt.day(), 15);
    }

    #[test]
    fn test_rfc2822_no_weekday() {
        let dt = parse_date("15 Jan 2026 10:30:00 +0000").unwrap();
        assert_eq!(dt.year(), 2026);
        assert_eq!(dt.month(), 1);
        assert_eq!(dt.day(), 15);
    }

    #[test]
    fn test_edge_case_leap_year() {
        assert_ymd("2024-02-29", 2024, 2, 29);
    }

    #[test]
    fn test_edge_case_invalid_date() {
        // 2023 is not a leap year
        assert!(parse_date("2023-02-29").is_none());
    }

    #[test]
    fn test_year_only_format() {
        let dt = parse_date("2024").unwrap();
        assert_eq!(dt.year(), 2024);
        assert_eq!(dt.month(), 1);
        assert_eq!(dt.day(), 1);
        assert_eq!(dt.hour(), 0);
    }

    #[test]
    fn test_year_month_format() {
        let dt = parse_date("2024-12").unwrap();
        assert_eq!(dt.year(), 2024);
        assert_eq!(dt.month(), 12);
        assert_eq!(dt.day(), 1);
        assert_eq!(dt.hour(), 0);
    }

    #[test]
    fn test_all_format_strings() {
        // (input, expected_year, expected_month, expected_day)
        let cases: &[(&str, i32, u32, u32)] = &[
            // ISO 8601 / RFC 3339 variants
            ("2024-12-14T10:30:45.123+00:00", 2024, 12, 14),
            ("2024-12-14T10:30:45+00:00", 2024, 12, 14),
            ("2024-12-14T10:30:45.123Z", 2024, 12, 14),
            ("2024-12-14T10:30:45Z", 2024, 12, 14),
            ("2024-12-14T10:30:45", 2024, 12, 14),
            ("2024-12-14 10:30:45", 2024, 12, 14),
            ("2024-12-14", 2024, 12, 14),
            // W3C Date-Time variants
            ("2024-12-14 10:30:45+00:00", 2024, 12, 14),
            ("2024/12/14 10:30:45", 2024, 12, 14),
            ("2024/12/14", 2024, 12, 14),
            // RFC 822 variants
            ("14 Dec 2024 10:30:45", 2024, 12, 14),
            ("14 Dec 2024", 2024, 12, 14),
            ("14 December 2024 10:30:45", 2024, 12, 14),
            ("14 December 2024", 2024, 12, 14),
            // US date formats
            ("December 14, 2024 10:30:45", 2024, 12, 14),
            ("December 14, 2024", 2024, 12, 14),
            ("Dec 14, 2024 10:30:45", 2024, 12, 14),
            ("Dec 14, 2024", 2024, 12, 14),
            ("12/14/2024 10:30:45", 2024, 12, 14),
            ("12/14/2024", 2024, 12, 14),
            ("12-14-2024", 2024, 12, 14),
            // EU date formats
            ("14.12.2024 10:30:45", 2024, 12, 14),
            ("14.12.2024", 2024, 12, 14),
            ("14/12/2024 10:30:45", 2024, 12, 14),
            ("14/12/2024", 2024, 12, 14),
            ("14-Dec-2024", 2024, 12, 14),
            ("14-December-2024", 2024, 12, 14),
            // Special cases
            ("2024", 2024, 1, 1),
            ("2024-12", 2024, 12, 1),
        ];

        for &(input, year, month, day) in cases {
            assert_ymd(input, year, month, day);
        }
    }
}