feedparser_rs/util/
date.rs

1//! Multi-format date parsing for RSS and Atom feeds
2
3use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc};
4
/// Fallback `strftime`-style patterns, attempted from top to bottom
///
/// The first pattern that accepts the input wins, so within each family the
/// more specific patterns are listed ahead of the looser fallbacks.
const DATE_FORMATS: &[&str] = &[
    // --- ISO 8601 / RFC 3339 family (Atom) ---
    "%Y-%m-%dT%H:%M:%S%.f%:z", // e.g. 2024-12-14T10:30:45.123+00:00
    "%Y-%m-%dT%H:%M:%S%:z",    // e.g. 2024-12-14T10:30:45+00:00
    "%Y-%m-%dT%H:%M:%S%.fZ",   // e.g. 2024-12-14T10:30:45.123Z
    "%Y-%m-%dT%H:%M:%SZ",      // e.g. 2024-12-14T10:30:45Z
    "%Y-%m-%dT%H:%M:%S",       // e.g. 2024-12-14T10:30:45 (timezone omitted)
    "%Y-%m-%d %H:%M:%S",       // e.g. 2024-12-14 10:30:45
    "%Y-%m-%d",                // e.g. 2024-12-14
    // --- W3C Date-Time family ---
    "%Y-%m-%d %H:%M:%S%:z", // e.g. 2024-12-14 10:30:45+00:00
    "%Y/%m/%d %H:%M:%S",    // e.g. 2024/12/14 10:30:45
    "%Y/%m/%d",             // e.g. 2024/12/14
    // --- RFC 822 family without weekday or zone (RSS pubDate) ---
    "%d %b %Y %H:%M:%S", // e.g. 14 Dec 2024 10:30:45
    "%d %b %Y",          // e.g. 14 Dec 2024
    "%d %B %Y %H:%M:%S", // e.g. 14 December 2024 10:30:45
    "%d %B %Y",          // e.g. 14 December 2024
    // --- US conventions (month before day) ---
    "%B %d, %Y %H:%M:%S", // e.g. December 14, 2024 10:30:45
    "%B %d, %Y",          // e.g. December 14, 2024
    "%b %d, %Y %H:%M:%S", // e.g. Dec 14, 2024 10:30:45
    "%b %d, %Y",          // e.g. Dec 14, 2024
    "%m/%d/%Y %H:%M:%S",  // e.g. 12/14/2024 10:30:45
    "%m/%d/%Y",           // e.g. 12/14/2024
    "%m-%d-%Y",           // e.g. 12-14-2024
    // --- EU conventions (day before month) ---
    "%d.%m.%Y %H:%M:%S", // e.g. 14.12.2024 10:30:45
    "%d.%m.%Y",          // e.g. 14.12.2024
    "%d/%m/%Y %H:%M:%S", // e.g. 14/12/2024 10:30:45
    "%d/%m/%Y",          // e.g. 14/12/2024
    "%d-%b-%Y",          // e.g. 14-Dec-2024
    "%d-%B-%Y",          // e.g. 14-December-2024
];
42
43/// Parse date from string, trying multiple formats
44///
45/// This function attempts to parse dates in the following order:
46/// 1. RFC 3339 (Atom standard: 2024-12-14T10:30:00Z)
47/// 2. RFC 2822 (RSS standard: Sat, 14 Dec 2024 10:30:00 +0000)
48/// 3. Common format strings (ISO 8601 variants, US/EU formats)
49///
50/// # Arguments
51///
52/// * `input` - Date string to parse
53///
54/// # Returns
55///
56/// * `Some(DateTime<Utc>)` - Successfully parsed date
57/// * `None` - Could not parse date
58///
59/// # Examples
60///
61/// ```
62/// use feedparser_rs::util::date::parse_date;
63///
64/// // RFC 3339 (Atom)
65/// assert!(parse_date("2024-12-14T10:30:00Z").is_some());
66///
67/// // RFC 2822 (RSS)
68/// assert!(parse_date("Sat, 14 Dec 2024 10:30:00 +0000").is_some());
69///
70/// // ISO 8601 date-only
71/// assert!(parse_date("2024-12-14").is_some());
72///
73/// // Invalid date
74/// assert!(parse_date("not a date").is_none());
75/// ```
76#[must_use]
77pub fn parse_date(input: &str) -> Option<DateTime<Utc>> {
78    let input = input.trim();
79
80    if input.is_empty() {
81        return None;
82    }
83
84    // Try RFC 3339 first (most common in Atom)
85    if let Ok(dt) = DateTime::parse_from_rfc3339(input) {
86        return Some(dt.with_timezone(&Utc));
87    }
88
89    // Try RFC 2822 (RSS pubDate format)
90    if let Ok(dt) = DateTime::parse_from_rfc2822(input) {
91        return Some(dt.with_timezone(&Utc));
92    }
93
94    // Try all format strings
95    for fmt in DATE_FORMATS {
96        // Try parsing with time component
97        if let Ok(dt) = NaiveDateTime::parse_from_str(input, fmt) {
98            return Some(dt.and_utc());
99        }
100
101        // Try parsing date-only, assume midnight UTC
102        if let Ok(date) = NaiveDate::parse_from_str(input, fmt) {
103            return date.and_hms_opt(0, 0, 0).map(|dt| dt.and_utc());
104        }
105    }
106
107    // Could not parse
108    None
109}
110
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::{Datelike, Timelike};

    /// Parses `input`, panicking with the offending string if it is rejected.
    fn parsed(input: &str) -> DateTime<Utc> {
        parse_date(input).unwrap_or_else(|| panic!("expected {:?} to parse", input))
    }

    /// Asserts that `input` parses to the given UTC calendar date.
    ///
    /// Checking the individual fields (rather than just `is_some()`) guards
    /// against day/month swaps between the overlapping US and EU patterns.
    fn assert_date(input: &str, year: i32, month: u32, day: u32) {
        let dt = parsed(input);
        assert_eq!(
            (dt.year(), dt.month(), dt.day()),
            (year, month, day),
            "unexpected date for {:?}",
            input
        );
    }

    /// Asserts that `input` parses to the given UTC time of day.
    fn assert_time(input: &str, hour: u32, minute: u32, second: u32) {
        let dt = parsed(input);
        assert_eq!(
            (dt.hour(), dt.minute(), dt.second()),
            (hour, minute, second),
            "unexpected time for {:?}",
            input
        );
    }

    #[test]
    fn test_rfc3339_with_timezone() {
        assert_date("2024-12-14T10:30:00+00:00", 2024, 12, 14);
        assert_time("2024-12-14T10:30:00+00:00", 10, 30, 0);
    }

    #[test]
    fn test_rfc3339_z_suffix() {
        assert_date("2024-12-14T10:30:00Z", 2024, 12, 14);
        assert_time("2024-12-14T10:30:00Z", 10, 30, 0);
    }

    #[test]
    fn test_rfc3339_with_milliseconds() {
        let dt = parsed("2024-12-14T10:30:00.123Z");
        assert_eq!(dt.year(), 2024);
        assert_eq!(dt.nanosecond(), 123_000_000);
    }

    #[test]
    fn test_rfc3339_offset_converted_to_utc() {
        // 10:30 at +05:30 is 05:00 in UTC
        assert_date("2024-12-14T10:30:00+05:30", 2024, 12, 14);
        assert_time("2024-12-14T10:30:00+05:30", 5, 0, 0);
    }

    #[test]
    fn test_rfc2822_format() {
        assert_date("Sat, 14 Dec 2024 10:30:00 +0000", 2024, 12, 14);
        assert_time("Sat, 14 Dec 2024 10:30:00 +0000", 10, 30, 0);
    }

    #[test]
    fn test_rfc2822_gmt() {
        assert_date("Sat, 14 Dec 2024 10:30:00 GMT", 2024, 12, 14);
        assert_time("Sat, 14 Dec 2024 10:30:00 GMT", 10, 30, 0);
    }

    #[test]
    fn test_iso8601_date_only() {
        assert_date("2024-12-14", 2024, 12, 14);
        // Date-only input is placed at midnight
        assert_time("2024-12-14", 0, 0, 0);
    }

    #[test]
    fn test_us_format_long_month() {
        assert_date("December 14, 2024", 2024, 12, 14);
    }

    #[test]
    fn test_us_format_short_month() {
        assert_date("Dec 14, 2024", 2024, 12, 14);
    }

    #[test]
    fn test_invalid_date() {
        assert!(parse_date("not a date").is_none());
    }

    #[test]
    fn test_empty_string() {
        assert!(parse_date("").is_none());
    }

    #[test]
    fn test_whitespace_only() {
        assert!(parse_date("   ").is_none());
    }

    #[test]
    fn test_partial_date() {
        // Should fail gracefully
        assert!(parse_date("2024-12").is_none());
    }

    #[test]
    fn test_us_date_slash_format() {
        assert_date("12/14/2024", 2024, 12, 14);
    }

    #[test]
    fn test_eu_date_dot_format() {
        assert_date("14.12.2024", 2024, 12, 14);
    }

    #[test]
    fn test_rfc822_without_day() {
        assert_date("14 Dec 2024", 2024, 12, 14);
    }

    #[test]
    fn test_rfc822_long_month() {
        assert_date("14 December 2024", 2024, 12, 14);
    }

    #[test]
    fn test_year_slash_format() {
        assert_date("2024/12/14", 2024, 12, 14);
    }

    #[test]
    fn test_dash_month_format() {
        assert_date("14-Dec-2024", 2024, 12, 14);
    }

    #[test]
    fn test_us_dash_format() {
        assert_date("12-14-2024", 2024, 12, 14);
    }

    #[test]
    fn test_eu_slash_with_time() {
        assert_date("14/12/2024 10:30:45", 2024, 12, 14);
        assert_time("14/12/2024 10:30:45", 10, 30, 45);
    }

    #[test]
    fn test_multiple_formats_dont_panic() {
        // Only checks that no input panics; results are intentionally ignored
        let dates = [
            "2024-12-14T10:30:00Z",
            "Sat, 14 Dec 2024 10:30:00 GMT",
            "14 Dec 2024",
            "December 14, 2024",
            "12/14/2024",
            "14.12.2024",
            "2024/12/14",
            "14-Dec-2024",
            "not a date",
            "",
            "2024",
            "12/2024",
        ];

        for date_str in dates {
            let _ = parse_date(date_str);
        }
    }

    #[test]
    fn test_edge_case_leap_year() {
        assert_date("2024-02-29", 2024, 2, 29);
    }

    #[test]
    fn test_edge_case_invalid_date() {
        // 2023 is not a leap year, so 29 February does not exist
        assert!(parse_date("2023-02-29").is_none());
    }
}