1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
//! [Specification](https://www.rssboard.org/rss-specification)
//!
//! ```xml
//! <rss>
//!   <channel>
//!     <title></title>
//!     <lastBuildDate>RFC 2822</lastBuildDate>
//!     <pubDate>RFC 2822</pubDate>
//!     <item>
//!       <title></title>
//!       <link></link>
//!       <pubDate>RFC 2822</pubDate>
//!       <description></description>?
//!     </item>
//!   </channel>
//! </rss>
//! ```

use chrono::{DateTime, FixedOffset, Utc};
use quick_xml::DeError;
use regex::Regex;
use serde_derive::{Deserialize, Serialize};

use crate::{
  blog::{Blog, Post},
  warn,
};

use super::{
  limit_description,
  traits::{BlogPost, WebFeed},
  ParserError,
};

/// Root `<rss>` element of an RSS document.
///
/// Serialized/deserialized under the tag name `rss` and holds the feed's
/// `<channel>` element.
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(rename = "rss")]
pub struct RssFeed {
  /// The feed's `<channel>` element.
  pub channel: Channel,
}

/// The `<channel>` element of an RSS feed.
///
/// Field names are mapped to camelCase tags (`lastBuildDate`, `pubDate`).
/// Dates are stored as the raw strings found in the feed; they are not parsed
/// while deserializing.
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Channel {
  /// Text of the channel's `<title>`.
  pub title: String,
  /// Raw text of `<lastBuildDate>`, if present.
  pub last_build_date: Option<String>,
  /// Raw text of the channel-level `<pubDate>`, if present.
  pub pub_date: Option<String>,
  /// The channel's `<item>` children; `default` yields an empty list when there are none.
  #[serde(rename = "item", default)]
  pub items: Vec<RssPost>,
}

/// A single `<item>` element of an RSS channel.
///
/// Every field is optional at the deserialization level; which combinations are
/// accepted is decided later, when the item is converted into a post.
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(rename_all = "camelCase")]
#[serde(rename = "item")]
pub struct RssPost {
  // Link and title can be omitted, according to spec, provided that there is a description
  // https://www.rssboard.org/rss-specification#hrelementsOfLtitemgt
  /// Text of the item's `<title>`, if present.
  pub title: Option<String>,
  /// Text of the item's `<link>`, if present.
  pub link: Option<String>,
  /// Text of the item's `<description>`, if present.
  pub description: Option<String>,
  /// Raw text of the item's `<pubDate>` (camelCase tag), if present.
  pub pub_date: Option<String>,
}

impl WebFeed for Result<RssFeed, DeError> {
  fn into_blog(self) -> Result<Blog, ParserError> {
    let feed = self?;
    let title = feed.channel.title;
    let site_last_build_date = feed.channel.pub_date;
    let items = feed.channel.items;
    let last_post_build_date = items.first().and_then(|x| x.clone().pub_date);

    let last_build_date = site_last_build_date
      .or(last_post_build_date)
      .ok_or_else(|| ParserError::Parse("Date not found.".to_owned()))?;

    let posts: Vec<Post> = items
      .iter()
      // TODO Turn this into a method
      .filter_map(|x| match x.clone().into_post() {
        Ok(post) => Some(post),
        Err(e) => {
          warn!(
            "\"{}\"'s post titled \"{}\" errored with '{}'",
            title,
            x.title
              .as_ref()
              .map_or_else(|| "n/a".to_string(), std::clone::Clone::clone),
            e
          );
          None
        }
      })
      .collect();

    let last_build_date = parse_date_helper(&last_build_date)?;

    Ok(Blog {
      title,
      most_recent_pub_date: last_build_date.with_timezone(&Utc),
      posts,
    })
  }
}

impl BlogPost for RssPost {
  fn into_post(self) -> Result<Post, ParserError> {
    let Some(link) = self.link else {
      return Err(ParserError::Parse("No link in post".to_string()));
    };

    let (title, description) = match (
      self.title,
      self.description.map(|desc| limit_description(&desc, 200)),
    ) {
      (Some(link), description) => (link, description),
      (None, None) => (link.clone(), None),
      (None, Some(description)) => {
        if description.len() > 50 {
          (limit_description(&description, 50), Some(description))
        } else {
          (description, None)
        }
      }
    };

    let pub_date = self
      .pub_date
      .ok_or_else(|| ParserError::Parse("Date not found.".to_owned()))?;

    let last_build_date = parse_date_helper(&pub_date)?;

    Ok(Post {
      title,
      link,
      description,
      pub_date: last_build_date.with_timezone(&Utc),
    })
  }
}

/// Parses a feed date string into a [`DateTime<FixedOffset>`].
///
/// Surrounding whitespace is ignored. The date is first parsed with
/// [`DateTime::parse_from_rfc2822`]; if that fails, [`parse_from_rfc822`] is tried instead.
///
/// # Errors
/// Returns the empty-date error when `date` is empty or contains only whitespace,
/// otherwise the error produced by [`parse_from_rfc822`].
fn parse_date_helper(date: &str) -> Result<DateTime<FixedOffset>, ParserError> {
  // Feed text may be padded with spaces or newlines; treat that the same as "no date"
  // and keep it from tripping the end-anchored timezone lookup in the fallback parser.
  let date = date.trim();

  if date.is_empty() {
    return Err(ParserError::empty_date_error());
  }

  DateTime::parse_from_rfc2822(date).or_else(|_| parse_from_rfc822(date))
}

/// Tries to parse [`RFC822`](https://www.w3.org/Protocols/rfc822/#z28). This is a very much *not*
/// complete solution since very few timezones are currently supported (see [`tz_to_offset`])
/// but it works for now and it is not used frequently. I will be updating it whenever I find
/// feeds that break it.
///
/// See [issue](https://github.com/AntoniosBarotsis/Rss2Email/issues/34).
fn parse_from_rfc822(date: &str) -> Result<DateTime<FixedOffset>, ParserError> {
  let format_str = "%d %b %y %H:%M";

  // See https://regex101.com/r/hHU76d/1 and https://www.w3.org/Protocols/rfc822/#z28
  // (military timezones are not supported by this regular expression).
  // Idea is to have a digit followed by an optional space
  // followed by a 2 or 3 letter time zone.
  let regex = Regex::new(r"\d\s?([a-zA-Z]{2,3}$)").expect("Invalid regex");

  let cap = regex
    .captures(date)
    .and_then(|x| x.get(1))
    .ok_or_else(|| ParserError::timezone_date_error("Timezone not found".to_owned()))?
    .as_str();

  let date = regex.replace_all(date, "").to_string();

  let tz = tz_to_offset(cap)?;

  DateTime::parse_from_str(&date, format_str)
    .map(|dt| dt.with_timezone(&tz))
    .map_err(|e| ParserError::generic_date_error(format!("Error parsing date '{date}' ({e})")))
}

/// Maps timezones from Strings to [`FixedOffset`]s
///
/// Recognizes `UTC` plus the named zones of RFC 822 §5.1: `UT`, `GMT` and the North
/// American `EST`/`EDT`, `CST`/`CDT`, `MST`/`MDT`, `PST`/`PDT`. Matching is
/// case-insensitive.
///
/// # Errors
/// Returns a timezone error for any other abbreviation.
fn tz_to_offset(tz: &str) -> Result<FixedOffset, ParserError> {
  // Whole hours east of UTC, as defined by RFC 822.
  let hours_east = match tz.to_ascii_uppercase().as_str() {
    "UTC" | "UT" | "GMT" => 0,
    "EDT" => -4,
    "EST" | "CDT" => -5,
    "CST" | "MDT" => -6,
    "MST" | "PDT" => -7,
    "PST" => -8,
    _ => {
      return Err(ParserError::timezone_date_error(format!(
        "Unknown timezone {tz}, please open an issue!"
      )))
    }
  };

  // Every table entry is far inside the one-day range `east_opt` accepts.
  Ok(FixedOffset::east_opt(hours_east * 3600).expect("FixedOffset::east out of bounds"))
}