rss2email_lib/xml/
rss.rs

1//! [Specification](https://www.rssboard.org/rss-specification)
2//!
3//! ```xml
4//! <rss>
5//!   <channel>
6//!     <title></title>
7//!     <lastBuildDate>RFC 2822</lastBuildDate>
8//!     <pubDate>RFC 2822</pubDate>
9//!     <item>
10//!       <title></title>
11//!       <link></link>
12//!       <pubDate>RFC 2822</pubDate>
13//!       <description></description>?
14//!     </item>
15//!   </channel>
16//! </rss>
17//! ```
18
19use chrono::{DateTime, FixedOffset, Utc};
20use quick_xml::DeError;
21use regex::Regex;
22use serde_derive::{Deserialize, Serialize};
23
24use crate::{
25  blog::{Blog, Post},
26  warn,
27};
28
29use super::{
30  limit_description,
31  traits::{BlogPost, WebFeed},
32  ParserError,
33};
34
35#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
36#[serde(rename = "rss")]
37pub struct RssFeed {
38  pub channel: Channel,
39}
40
41#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
42#[serde(rename_all = "camelCase")]
43pub struct Channel {
44  pub title: String,
45  pub last_build_date: Option<String>,
46  pub pub_date: Option<String>,
47  #[serde(rename = "item", default)]
48  pub items: Vec<RssPost>,
49}
50
51#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
52#[serde(rename_all = "camelCase")]
53#[serde(rename = "item")]
54pub struct RssPost {
55  // Link and title can be omitted, according to spec, provided that there is a description
56  // https://www.rssboard.org/rss-specification#hrelementsOfLtitemgt
57  pub title: Option<String>,
58  pub link: Option<String>,
59  pub description: Option<String>,
60  pub pub_date: Option<String>,
61}
62
63impl WebFeed for Result<RssFeed, DeError> {
64  fn into_blog(self) -> Result<Blog, ParserError> {
65    let feed = self?;
66    let title = feed.channel.title;
67    let site_last_build_date = feed.channel.pub_date;
68    let items = feed.channel.items;
69    let last_post_build_date = items.first().and_then(|x| x.clone().pub_date);
70
71    let last_build_date = site_last_build_date
72      .or(last_post_build_date)
73      .ok_or_else(|| ParserError::Parse("Date not found.".to_owned()))?;
74
75    let posts: Vec<Post> = items
76      .iter()
77      // TODO Turn this into a method
78      .filter_map(|x| match x.clone().into_post() {
79        Ok(post) => Some(post),
80        Err(e) => {
81          warn!(
82            "\"{}\"'s post titled \"{}\" errored with '{}'",
83            title,
84            x.title
85              .as_ref()
86              .map_or_else(|| "n/a".to_string(), Clone::clone),
87            e
88          );
89          None
90        }
91      })
92      .collect();
93
94    let last_build_date = parse_date_helper(&last_build_date)?;
95
96    Ok(Blog {
97      title,
98      most_recent_pub_date: last_build_date.with_timezone(&Utc),
99      posts,
100    })
101  }
102}
103
104impl BlogPost for RssPost {
105  fn into_post(self) -> Result<Post, ParserError> {
106    let Some(link) = self.link else {
107      return Err(ParserError::Parse("No link in post".to_string()));
108    };
109
110    let (title, description) = match (
111      self.title,
112      self.description.map(|desc| limit_description(&desc, 200)),
113    ) {
114      (Some(link), description) => (link, description),
115      (None, None) => (link.clone(), None),
116      (None, Some(description)) => {
117        if description.len() > 50 {
118          (limit_description(&description, 50), Some(description))
119        } else {
120          (description, None)
121        }
122      }
123    };
124
125    let pub_date = self
126      .pub_date
127      .ok_or_else(|| ParserError::Parse("Date not found.".to_owned()))?;
128
129    let last_build_date = parse_date_helper(&pub_date)?;
130
131    Ok(Post {
132      title,
133      link,
134      description,
135      pub_date: last_build_date.with_timezone(&Utc),
136    })
137  }
138}
139
140/// Helper method that first tries to parse a date using [`DateTime::parse_from_rfc2822`]
141/// and if that fails, it tries with [`parse_from_rfc822`].
142fn parse_date_helper(date: &str) -> Result<DateTime<FixedOffset>, ParserError> {
143  if date.is_empty() {
144    return Err(ParserError::empty_date_error());
145  }
146
147  DateTime::parse_from_rfc2822(date).or_else(|_| parse_from_rfc822(date))
148}
149
150/// Tries to parse [`RFC822`](https://www.w3.org/Protocols/rfc822/#z28). This is a very much *not*
151/// complete solution since very few timezones are currently supported (see [`tz_to_offset`])
152/// but it works for now and it is not used frequently. I will be updating it whenever I find
153/// feeds that break it.
154///
155/// See [issue](https://github.com/AntoniosBarotsis/Rss2Email/issues/34).
156fn parse_from_rfc822(date: &str) -> Result<DateTime<FixedOffset>, ParserError> {
157  let format_str = "%d %b %y %H:%M";
158
159  // See https://regex101.com/r/hHU76d/1 and https://www.w3.org/Protocols/rfc822/#z28
160  // (military timezones are not supported by this regular expression).
161  // Idea is to have a digit followed by an optional space
162  // followed by a 2 or 3 letter time zone.
163  let regex = Regex::new(r"\d\s?([a-zA-Z]{2,3}$)").expect("Invalid regex");
164
165  let cap = regex
166    .captures(date)
167    .and_then(|x| x.get(1))
168    .ok_or_else(|| ParserError::timezone_date_error("Timezone not found".to_owned()))?
169    .as_str();
170
171  let date = regex.replace_all(date, "").to_string();
172
173  let tz = tz_to_offset(cap)?;
174
175  DateTime::parse_from_str(&date, format_str)
176    .map(|dt| dt.with_timezone(&tz))
177    .map_err(|e| ParserError::generic_date_error(format!("Error parsing date '{date}' ({e})")))
178}
179
180/// Maps timezones from Strings to [`FixedOffset`]s
181fn tz_to_offset(tz: &str) -> Result<FixedOffset, ParserError> {
182  match tz {
183    "UTC" => Ok(FixedOffset::east_opt(0).expect("FixedOffset::east out of bounds")),
184    _ => Err(ParserError::timezone_date_error(format!(
185      "Unknown timezone {tz}, please open an issue!"
186    ))),
187  }
188}