feedparser_rs/util/
date.rs

1//! Multi-format date parsing for RSS and Atom feeds
2
3use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc};
4
/// Fallback `strftime`-style patterns, listed in the order they are attempted
///
/// The table is scanned front to back and the first pattern that matches
/// wins, so precise patterns sit ahead of looser ones. One consequence: an
/// ambiguous numeric date such as `01/02/2024` is read month-first, because
/// the US patterns precede the EU ones.
const DATE_FORMATS: &[&str] = &[
    // ---- ISO 8601 / RFC 3339 family (Atom) ----
    // 2024-12-14T10:30:45.123+00:00
    "%Y-%m-%dT%H:%M:%S%.f%:z",
    // 2024-12-14T10:30:45+00:00
    "%Y-%m-%dT%H:%M:%S%:z",
    // 2024-12-14T10:30:45.123Z
    "%Y-%m-%dT%H:%M:%S%.fZ",
    // 2024-12-14T10:30:45Z
    "%Y-%m-%dT%H:%M:%SZ",
    // 2024-12-14T10:30:45 (no timezone)
    "%Y-%m-%dT%H:%M:%S",
    // 2024-12-14 10:30:45
    "%Y-%m-%d %H:%M:%S",
    // 2024-12-14
    "%Y-%m-%d",
    // ---- W3C Date-Time family ----
    // 2024-12-14 10:30:45+00:00
    "%Y-%m-%d %H:%M:%S%:z",
    // 2024/12/14 10:30:45
    "%Y/%m/%d %H:%M:%S",
    // 2024/12/14
    "%Y/%m/%d",
    // ---- RFC 822 family (RSS pubDate) without weekday or zone ----
    // 14 Dec 2024 10:30:45
    "%d %b %Y %H:%M:%S",
    // 14 Dec 2024
    "%d %b %Y",
    // 14 December 2024 10:30:45
    "%d %B %Y %H:%M:%S",
    // 14 December 2024
    "%d %B %Y",
    // ---- US conventions (month first) ----
    // December 14, 2024 10:30:45
    "%B %d, %Y %H:%M:%S",
    // December 14, 2024
    "%B %d, %Y",
    // Dec 14, 2024 10:30:45
    "%b %d, %Y %H:%M:%S",
    // Dec 14, 2024
    "%b %d, %Y",
    // 12/14/2024 10:30:45
    "%m/%d/%Y %H:%M:%S",
    // 12/14/2024
    "%m/%d/%Y",
    // 12-14-2024
    "%m-%d-%Y",
    // ---- EU conventions (day first) ----
    // 14.12.2024 10:30:45
    "%d.%m.%Y %H:%M:%S",
    // 14.12.2024
    "%d.%m.%Y",
    // 14/12/2024 10:30:45
    "%d/%m/%Y %H:%M:%S",
    // 14/12/2024
    "%d/%m/%Y",
    // 14-Dec-2024
    "%d-%b-%Y",
    // 14-December-2024
    "%d-%B-%Y",
];
42
43/// Parse date from string, trying multiple formats
44///
45/// This function attempts to parse dates in the following order:
46/// 1. RFC 3339 (Atom standard: 2024-12-14T10:30:00Z)
47/// 2. RFC 2822 (RSS standard: Sat, 14 Dec 2024 10:30:00 +0000)
48/// 3. Common format strings (ISO 8601 variants, US/EU formats)
49///
50/// # Arguments
51///
52/// * `input` - Date string to parse
53///
54/// # Returns
55///
56/// * `Some(DateTime<Utc>)` - Successfully parsed date
57/// * `None` - Could not parse date
58///
59/// # Examples
60///
61/// ```
62/// use feedparser_rs::util::date::parse_date;
63///
64/// // RFC 3339 (Atom)
65/// assert!(parse_date("2024-12-14T10:30:00Z").is_some());
66///
67/// // RFC 2822 (RSS)
68/// assert!(parse_date("Sat, 14 Dec 2024 10:30:00 +0000").is_some());
69///
70/// // ISO 8601 date-only
71/// assert!(parse_date("2024-12-14").is_some());
72///
73/// // Invalid date
74/// assert!(parse_date("not a date").is_none());
75/// ```
76#[must_use]
77pub fn parse_date(input: &str) -> Option<DateTime<Utc>> {
78    let input = input.trim();
79
80    if input.is_empty() {
81        return None;
82    }
83
84    // Try RFC 3339 first (most common in Atom)
85    if let Ok(dt) = DateTime::parse_from_rfc3339(input) {
86        return Some(dt.with_timezone(&Utc));
87    }
88
89    // Try RFC 2822 (RSS pubDate format)
90    if let Ok(dt) = DateTime::parse_from_rfc2822(input) {
91        return Some(dt.with_timezone(&Utc));
92    }
93
94    // Special handling for year-only format (e.g., "2024")
95    if let Ok(year) = input.parse::<i32>()
96        && (1000..=9999).contains(&year)
97    {
98        return NaiveDate::from_ymd_opt(year, 1, 1)
99            .and_then(|d| d.and_hms_opt(0, 0, 0))
100            .map(|dt| dt.and_utc());
101    }
102
103    // Special handling for year-month format (e.g., "2024-12")
104    if input.len() == 7
105        && input.chars().nth(4) == Some('-')
106        && let (Ok(year), Ok(month)) = (input[..4].parse::<i32>(), input[5..7].parse::<u32>())
107        && (1000..=9999).contains(&year)
108        && (1..=12).contains(&month)
109    {
110        return NaiveDate::from_ymd_opt(year, month, 1)
111            .and_then(|d| d.and_hms_opt(0, 0, 0))
112            .map(|dt| dt.and_utc());
113    }
114
115    // Try all format strings
116    for fmt in DATE_FORMATS {
117        // Try parsing with time component
118        if let Ok(dt) = NaiveDateTime::parse_from_str(input, fmt) {
119            return Some(dt.and_utc());
120        }
121
122        // Try parsing date-only, assume midnight UTC
123        if let Ok(date) = NaiveDate::parse_from_str(input, fmt) {
124            return date.and_hms_opt(0, 0, 0).map(|dt| dt.and_utc());
125        }
126    }
127
128    // Could not parse
129    None
130}
131
#[cfg(test)]
mod tests {
    //! Unit tests for `parse_date`.
    //!
    //! Tests check the parsed calendar date (and time where the input has
    //! one) rather than only `is_some()`, so that a pattern matching with
    //! day and month swapped is caught.

    use super::*;
    use chrono::{DateTime, Datelike, Timelike, Utc};

    /// Parse `input`, failing the calling test with the offending string if
    /// it is rejected
    fn parsed(input: &str) -> DateTime<Utc> {
        parse_date(input).unwrap_or_else(|| panic!("Failed to parse: {input}"))
    }

    /// Calendar date of `dt` as `(year, month, day)`
    fn ymd(dt: DateTime<Utc>) -> (i32, u32, u32) {
        (dt.year(), dt.month(), dt.day())
    }

    /// Time of day of `dt` as `(hour, minute, second)`
    fn hms(dt: DateTime<Utc>) -> (u32, u32, u32) {
        (dt.hour(), dt.minute(), dt.second())
    }

    #[test]
    fn test_rfc3339_with_timezone() {
        let dt = parsed("2024-12-14T10:30:00+00:00");
        assert_eq!(ymd(dt), (2024, 12, 14));
        assert_eq!(hms(dt), (10, 30, 0));
    }

    #[test]
    fn test_rfc3339_z_suffix() {
        let dt = parsed("2024-12-14T10:30:00Z");
        assert_eq!(ymd(dt), (2024, 12, 14));
        assert_eq!(hms(dt), (10, 30, 0));
    }

    #[test]
    fn test_rfc3339_with_milliseconds() {
        let dt = parsed("2024-12-14T10:30:00.123Z");
        assert_eq!(ymd(dt), (2024, 12, 14));
        assert_eq!(dt.timestamp_subsec_millis(), 123);
    }

    #[test]
    fn test_rfc2822_format() {
        let dt = parsed("Sat, 14 Dec 2024 10:30:00 +0000");
        assert_eq!(ymd(dt), (2024, 12, 14));
        assert_eq!(hms(dt), (10, 30, 0));
    }

    #[test]
    fn test_rfc2822_gmt() {
        let dt = parsed("Sat, 14 Dec 2024 10:30:00 GMT");
        assert_eq!(ymd(dt), (2024, 12, 14));
        assert_eq!(hms(dt), (10, 30, 0));
    }

    #[test]
    fn test_iso8601_date_only() {
        let dt = parsed("2024-12-14");
        assert_eq!(ymd(dt), (2024, 12, 14));
        assert_eq!(hms(dt), (0, 0, 0)); // Midnight
    }

    #[test]
    fn test_us_format_long_month() {
        assert_eq!(ymd(parsed("December 14, 2024")), (2024, 12, 14));
    }

    #[test]
    fn test_us_format_short_month() {
        assert_eq!(ymd(parsed("Dec 14, 2024")), (2024, 12, 14));
    }

    #[test]
    fn test_invalid_date() {
        assert!(parse_date("not a date").is_none());
    }

    #[test]
    fn test_empty_string() {
        assert!(parse_date("").is_none());
    }

    #[test]
    fn test_whitespace_only() {
        assert!(parse_date("   ").is_none());
    }

    #[test]
    fn test_partial_date_invalid() {
        // Invalid partial dates should fail
        assert!(parse_date("2024-13").is_none()); // Invalid month
        assert!(parse_date("abcd-12").is_none());
    }

    #[test]
    fn test_us_date_slash_format() {
        // Day 14 cannot be a month, so this must resolve month-first
        assert_eq!(ymd(parsed("12/14/2024")), (2024, 12, 14));
    }

    #[test]
    fn test_eu_date_dot_format() {
        assert_eq!(ymd(parsed("14.12.2024")), (2024, 12, 14));
    }

    #[test]
    fn test_rfc822_without_day() {
        assert_eq!(ymd(parsed("14 Dec 2024")), (2024, 12, 14));
    }

    #[test]
    fn test_rfc822_long_month() {
        assert_eq!(ymd(parsed("14 December 2024")), (2024, 12, 14));
    }

    #[test]
    fn test_year_slash_format() {
        assert_eq!(ymd(parsed("2024/12/14")), (2024, 12, 14));
    }

    #[test]
    fn test_dash_month_format() {
        assert_eq!(ymd(parsed("14-Dec-2024")), (2024, 12, 14));
    }

    #[test]
    fn test_us_dash_format() {
        assert_eq!(ymd(parsed("12-14-2024")), (2024, 12, 14));
    }

    #[test]
    fn test_eu_slash_with_time() {
        let dt = parsed("14/12/2024 10:30:45");
        assert_eq!(ymd(dt), (2024, 12, 14));
        assert_eq!(hms(dt), (10, 30, 45));
    }

    #[test]
    fn test_multiple_formats_dont_panic() {
        // Only the absence of a panic matters here; results are discarded
        let dates = [
            "2024-12-14T10:30:00Z",
            "Sat, 14 Dec 2024 10:30:00 GMT",
            "14 Dec 2024",
            "December 14, 2024",
            "12/14/2024",
            "14.12.2024",
            "2024/12/14",
            "14-Dec-2024",
            "not a date",
            "",
            "2024",
            "12/2024",
        ];

        for date_str in dates {
            let _ = parse_date(date_str);
        }
    }

    #[test]
    fn test_edge_case_leap_year() {
        assert_eq!(ymd(parsed("2024-02-29")), (2024, 2, 29));
    }

    #[test]
    fn test_edge_case_invalid_date() {
        // 2023 is not a leap year
        assert!(parse_date("2023-02-29").is_none());
    }

    #[test]
    fn test_year_only_format() {
        let dt = parsed("2024");
        assert_eq!(ymd(dt), (2024, 1, 1));
        assert_eq!(dt.hour(), 0);
    }

    #[test]
    fn test_year_month_format() {
        let dt = parsed("2024-12");
        assert_eq!(ymd(dt), (2024, 12, 1));
        assert_eq!(dt.hour(), 0);
    }

    #[test]
    fn test_all_new_formats() {
        let test_cases = [("2024", 2024, 1, 1), ("2024-12", 2024, 12, 1)];

        for (date_str, year, month, day) in test_cases {
            let dt = parsed(date_str);
            assert_eq!(dt.year(), year, "Year mismatch for: {date_str}");
            assert_eq!(dt.month(), month, "Month mismatch for: {date_str}");
            assert_eq!(dt.day(), day, "Day mismatch for: {date_str}");
        }
    }
}