use chrono::{DateTime, FixedOffset, Utc};
use quick_xml::DeError;
use regex::Regex;
use serde_derive::{Deserialize, Serialize};
use crate::{
blog::{Blog, Post},
warn,
};
use super::{
limit_description,
traits::{BlogPost, WebFeed},
ParserError,
};
/// Root of an RSS document, (de)serialized under the element name `rss`.
///
/// The `WebFeed` implementation in this file converts a deserialization
/// result of this type into a `Blog`.
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(rename = "rss")]
pub struct RssFeed {
/// The feed's `channel` element, holding the feed metadata and its items.
pub channel: Channel,
}
/// The `channel` element of an RSS feed: feed-level metadata plus the posts.
///
/// Field names are mapped to camelCase on the wire (`last_build_date` <->
/// `lastBuildDate`, `pub_date` <-> `pubDate`). Dates are kept as raw strings
/// here and only parsed when the feed is converted into a `Blog`.
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Channel {
/// Feed title (required).
pub title: String,
/// Raw `lastBuildDate` text, if present.
/// NOTE(review): this field is not read by `into_blog` in this file.
pub last_build_date: Option<String>,
/// Raw `pubDate` text of the channel, if present.
pub pub_date: Option<String>,
/// The feed's `item` elements; falls back to an empty list when none are present.
#[serde(rename = "item", default)]
pub items: Vec<RssPost>,
}
/// A single `item` element of an RSS channel.
///
/// Every field is optional at the deserialization level; `into_post` in this
/// file rejects items that lack a `link` or a `pub_date`. Field names are
/// mapped to camelCase on the wire (`pub_date` <-> `pubDate`).
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(rename_all = "camelCase")]
#[serde(rename = "item")]
pub struct RssPost {
/// Post title, if present.
pub title: Option<String>,
/// URL of the post, if present.
pub link: Option<String>,
/// Post description/summary text, if present.
pub description: Option<String>,
/// Raw `pubDate` text of the post, if present.
pub pub_date: Option<String>,
}
/// Turns the outcome of deserializing an RSS document into a [`Blog`].
impl WebFeed for Result<RssFeed, DeError> {
    /// Builds a [`Blog`] from the deserialized feed.
    ///
    /// The blog's `most_recent_pub_date` is taken from the channel's `pub_date`,
    /// falling back to the `pub_date` of the first item. Items that cannot be
    /// converted into a [`Post`] are skipped with a warning instead of failing
    /// the whole feed.
    ///
    /// NOTE(review): the channel's `last_build_date` is not consulted here.
    ///
    /// # Errors
    ///
    /// * the deserialization error carried by `self`, converted via `?`;
    /// * `ParserError::Parse("Date not found.")` when neither the channel nor
    ///   the first item carries a `pub_date`;
    /// * any error from parsing the selected date string.
    fn into_blog(self) -> Result<Blog, ParserError> {
        let feed = self?;
        let title = feed.channel.title;
        let items = feed.channel.items;

        // Only the date string is needed from the first item, so clone just
        // that field rather than the whole post.
        let last_build_date = feed
            .channel
            .pub_date
            .or_else(|| items.first().and_then(|item| item.pub_date.clone()))
            .ok_or_else(|| ParserError::Parse("Date not found.".to_owned()))?;

        let posts: Vec<Post> = items
            .into_iter()
            .filter_map(|item| {
                // `into_post` consumes the item; keep only its title around
                // for the warning instead of cloning the entire item.
                let post_title = item.title.clone();
                match item.into_post() {
                    Ok(post) => Some(post),
                    Err(e) => {
                        warn!(
                            "\"{}\"'s post titled \"{}\" errored with '{}'",
                            title,
                            post_title.unwrap_or_else(|| "n/a".to_string()),
                            e
                        );
                        None
                    }
                }
            })
            .collect();

        let last_build_date = parse_date_helper(&last_build_date)?;
        Ok(Blog {
            title,
            most_recent_pub_date: last_build_date.with_timezone(&Utc),
            posts,
        })
    }
}
/// Conversion of a raw RSS item into the crate's [`Post`] type.
impl BlogPost for RssPost {
    /// Converts this item into a [`Post`].
    ///
    /// Title/description selection:
    /// * a title is present: use it, with the description limited to 200;
    /// * no title, description (after the 200 limit) longer than 50 bytes:
    ///   the title is the description limited to 50 and the description is kept;
    /// * no title, shorter description: the description becomes the title and
    ///   the post has no description;
    /// * neither: the link doubles as the title.
    ///
    /// # Errors
    ///
    /// Returns `ParserError::Parse` when the item has no link or no
    /// publication date, and propagates any date-parsing error.
    fn into_post(self) -> Result<Post, ParserError> {
        let link = match self.link {
            Some(url) => url,
            None => return Err(ParserError::Parse("No link in post".to_string())),
        };

        let summary = self.description.map(|text| limit_description(&text, 200));
        let (title, description) = match (self.title, summary) {
            (Some(post_title), summary) => (post_title, summary),
            (None, Some(summary)) if summary.len() > 50 => {
                (limit_description(&summary, 50), Some(summary))
            }
            (None, Some(summary)) => (summary, None),
            (None, None) => (link.clone(), None),
        };

        let raw_date = match self.pub_date {
            Some(raw) => raw,
            None => return Err(ParserError::Parse("Date not found.".to_owned())),
        };
        let published = parse_date_helper(&raw_date)?;

        Ok(Post {
            title,
            link,
            description,
            pub_date: published.with_timezone(&Utc),
        })
    }
}
/// Parses a feed date string, trying RFC 2822 first and falling back to
/// `parse_from_rfc822` when that fails.
///
/// # Errors
///
/// Returns `ParserError::empty_date_error()` for an empty string, otherwise
/// whatever error the fallback parser produces.
fn parse_date_helper(date: &str) -> Result<DateTime<FixedOffset>, ParserError> {
    match date {
        "" => Err(ParserError::empty_date_error()),
        raw => match DateTime::parse_from_rfc2822(raw) {
            Ok(parsed) => Ok(parsed),
            // The RFC 2822 error is discarded; the fallback reports its own.
            Err(_) => parse_from_rfc822(raw),
        },
    }
}
fn parse_from_rfc822(date: &str) -> Result<DateTime<FixedOffset>, ParserError> {
let format_str = "%d %b %y %H:%M";
let regex = Regex::new(r"\d\s?([a-zA-Z]{2,3}$)").expect("Invalid regex");
let cap = regex
.captures(date)
.and_then(|x| x.get(1))
.ok_or_else(|| ParserError::timezone_date_error("Timezone not found".to_owned()))?
.as_str();
let date = regex.replace_all(date, "").to_string();
let tz = tz_to_offset(cap)?;
DateTime::parse_from_str(&date, format_str)
.map(|dt| dt.with_timezone(&tz))
.map_err(|e| ParserError::generic_date_error(format!("Error parsing date '{date}' ({e})")))
}
/// Maps a timezone abbreviation to its fixed UTC offset.
///
/// Only `"UTC"` is recognised (exact, case-sensitive match).
///
/// # Errors
///
/// Returns a timezone date error naming the abbreviation for anything else.
fn tz_to_offset(tz: &str) -> Result<FixedOffset, ParserError> {
    if tz == "UTC" {
        return Ok(FixedOffset::east_opt(0).expect("FixedOffset::east out of bounds"));
    }
    Err(ParserError::timezone_date_error(format!(
        "Unknown timezone {tz}, please open an issue!"
    )))
}