mod error;
pub use self::error::FeedParserError;
use crate::models::{Feed, FeedID, Url};
use feed_rs::parser;
use kuchikikiki::traits::*;
use mime::Mime;
use reqwest::Client;
use std::{cmp::Ordering, sync::Arc};
use tokio::sync::Semaphore;
/// Outcome of resolving an url: either a single parsed feed, or a list of
/// candidate feeds discovered inside an HTML page.
#[derive(Clone, Debug)]
pub enum ParsedUrl {
    /// The url yielded exactly one parsed feed.
    SingleFeed(Box<Feed>),
    /// The url was an HTML page containing several `<link rel="alternate">`
    /// feed references; the caller must pick one.
    MultipleFeeds(Vec<Feed>),
}
/// Coarse classification of an HTTP response's `Content-Type` header,
/// produced by [`check_content_type`]. Only `Html` changes control flow in
/// [`download_and_parse_feed`]; the other variants are informational.
#[derive(Clone, Copy, Debug, PartialEq)]
enum ContentType {
    Rss,
    Atom,
    JsonFeed,
    /// An HTML page that may link to feeds via `<link rel="alternate">`.
    Html,
}
pub async fn download_and_parse_feed(
url: &Url,
id: &FeedID,
title: Option<String>,
semaphore: Arc<Semaphore>,
client: &Client,
) -> Result<ParsedUrl, FeedParserError> {
let permit = semaphore.acquire().await?;
let result = client
.get(url.as_str())
.send()
.await
.map_err(|error| {
tracing::error!(?error, "get request failed");
FeedParserError::Http(error)
})?
.error_for_status()
.map_err(|error| {
tracing::error!(%error, %url, "Downloading feed failed");
FeedParserError::Http(error)
})?;
let redirected_url = Url::new(result.url().clone());
tracing::debug!(%redirected_url);
let sniffed_content = check_content_type(result.headers().get(reqwest::header::CONTENT_TYPE));
tracing::debug!(?sniffed_content);
let result_string = result
.text()
.await
.inspect_err(|error| tracing::error!(%redirected_url, %error, "Reading response as bytes failed"))?;
if let Some(ContentType::Html) = sniffed_content {
tracing::debug!("ContentType: Html -> trying to parse page for feed url");
if let Ok(feed_vec) = parse_html(&result_string, &redirected_url) {
tracing::debug!(?feed_vec, "feeds");
match feed_vec.len().cmp(&1) {
Ordering::Greater => return Ok(ParsedUrl::MultipleFeeds(feed_vec)),
Ordering::Less => return Err(FeedParserError::Html),
Ordering::Equal => {
if let Some(Feed {
feed_id: _,
label: _,
website: _,
feed_url: Some(url),
icon_url: _,
error_count: _,
error_message: _,
}) = feed_vec.first()
{
let new_result = client.get(url.as_str()).send().await?;
let new_result_string = new_result.text().await?;
return Ok(ParsedUrl::SingleFeed(parse_feed(new_result_string, url, id, title)?));
}
}
}
}
}
drop(permit);
Ok(ParsedUrl::SingleFeed(parse_feed(result_string, &redirected_url, id, title)?))
}
/// Scan an HTML document for `<link rel="alternate">` feed references.
///
/// `base_url` is used to resolve relative `href` values. Links pointing at
/// mobile apps (`android-app:`/`ios-app:`) and links whose `type` attribute
/// does not mention `rss`, `atom` or `xml` are skipped, as are links whose
/// url cannot be parsed or resolved.
///
/// # Errors
/// [`FeedParserError::Html`] when the CSS selector cannot be compiled.
fn parse_html(html: &str, base_url: &Url) -> Result<Vec<Feed>, FeedParserError> {
    let document = kuchikikiki::parse_html().one(html);
    let nodes = document.select("link[rel=\"alternate\"]").map_err(|_| FeedParserError::Html)?;
    let mut result_vec: Vec<Feed> = Vec::new();
    for node in nodes {
        let attrs = node.attributes.borrow();
        let Some(url) = attrs.get("href") else {
            tracing::warn!("<link> tag is missing href property");
            continue;
        };
        if url.starts_with("android-app") || url.starts_with("ios-app") {
            continue;
        }
        // Filter by the declared mime type before doing any url work (the
        // original borrowed the attributes a second time here and only
        // filtered after parsing/logging the url).
        match attrs.get("type") {
            None => continue,
            Some(link_type) => {
                if !link_type.contains("rss") && !link_type.contains("atom") && !link_type.contains("xml") {
                    continue;
                }
            }
        }
        tracing::debug!(%url, "Parsing Html yielded feed");
        // Fall back to the url itself when the link carries no title.
        let title = attrs.get("title").unwrap_or(url);
        let url = match Url::parse(url) {
            Ok(url) => url,
            // Not an absolute url: try resolving it relative to the page.
            Err(_) => match base_url.clone().join(url) {
                Ok(url) => Url::new(url),
                Err(error) => {
                    tracing::warn!(%url, %error, "Failed to parse url");
                    continue;
                }
            },
        };
        result_vec.push(Feed {
            feed_id: FeedID::new(url.as_str()),
            label: title.to_string(),
            website: None,
            feed_url: Some(url),
            icon_url: None,
            error_count: 0,
            error_message: None,
        });
    }
    Ok(result_vec)
}
/// Sniff a coarse [`ContentType`] from an HTTP `Content-Type` header.
///
/// Returns `None` when the header is absent, is not valid UTF-8, does not
/// parse as a mime type, or is simply not one of the types we care about.
fn check_content_type(header: Option<&reqwest::header::HeaderValue>) -> Option<ContentType> {
    let mime = header?.to_str().ok()?.parse::<Mime>().ok()?;
    // `essence_str` strips parameters but keeps any "+suffix", so feeds are
    // announced as e.g. "application/atom+xml" (RFC 4287) — the previous
    // comparison against the bare "application/atom" never matched such
    // headers. Match the full registered names and keep the bare forms as a
    // fallback for sloppy servers.
    let essence = mime.essence_str();
    match essence {
        "text/html" => Some(ContentType::Html),
        "application/atom+xml" | "application/atom" => Some(ContentType::Atom),
        "application/rss+xml" | "application/rss" | "text/xml" | "application/xml" => Some(ContentType::Rss),
        "application/feed+json" | "application/vnd.api+json" => Some(ContentType::JsonFeed),
        _ => {
            // Preserve the original suffix-based JSON:API check in case the
            // mime crate reports the subtype without its suffix.
            if essence == "application/vnd.api" && mime.suffix().map(|name| name.as_str()) == Some("json") {
                Some(ContentType::JsonFeed)
            } else {
                None
            }
        }
    }
}
/// Parse raw feed text into a [`Feed`], using `url` as the base uri for
/// relative links, forcing the feed id to `id`, and preferring `title` over
/// the title found in the feed itself.
///
/// # Errors
/// [`FeedParserError::Feed`] when `feed_rs` cannot parse the input.
fn parse_feed(feed: String, url: &Url, id: &FeedID, title: Option<String>) -> Result<Box<Feed>, FeedParserError> {
    let parsed = parser::Builder::new()
        .base_uri(Some(url))
        .build()
        .parse(feed.as_bytes())
        .map_err(|error| {
            tracing::error!(%error, "parsing feed failed");
            FeedParserError::Feed
        })?;
    let mut result = Feed::from_feed_rs(&parsed, title, url);
    // The caller's id wins over whatever the feed content would suggest.
    result.feed_id = id.clone();
    Ok(Box::new(result))
}
#[cfg(test)]
mod tests {
    //! Network-dependent integration tests.
    //!
    //! NOTE(review): every test below downloads a live url, so the suite
    //! requires internet access and will start failing whenever the remote
    //! sites change their markup, feed metadata, or availability.
    use std::sync::Arc;
    use super::ParsedUrl;
    use crate::models::{FeedID, Url};
    use crate::util::feed_parser;
    use reqwest::Client;
    use test_log::test;
    use tokio::sync::Semaphore;
    // Atom feed fetched with a browser-like client configuration.
    #[test(tokio::test)]
    pub async fn golem_atom() {
        let client = reqwest::ClientBuilder::new()
            .user_agent("Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0")
            .use_native_tls()
            .hickory_dns(true)
            .gzip(true)
            .build()
            .unwrap();
        let url_text = "https://rss.golem.de/rss.php?feed=ATOM1.0";
        let url = Url::parse(url_text).unwrap();
        let feed_id = FeedID::new(url_text);
        let feed = feed_parser::download_and_parse_feed(&url, &feed_id, None, Arc::new(Semaphore::new(20)), &client)
            .await
            .unwrap();
        let feed = match feed {
            ParsedUrl::SingleFeed(feed) => feed,
            ParsedUrl::MultipleFeeds(_) => panic!("Expected Single Feed"),
        };
        assert_eq!(feed.label, "Golem.de");
        assert_eq!(feed.icon_url.unwrap().to_string(), "https://www.golem.de/assets/icons/favicon.ico");
        assert_eq!(feed.website.unwrap().to_string(), "https://www.golem.de/");
    }
    // Same site as above, but requesting the RSS 1.0 variant with a plain client.
    #[test(tokio::test)]
    pub async fn golem_rss() {
        let url_text = "https://rss.golem.de/rss.php?feed=RSS1.0";
        let url = Url::parse(url_text).unwrap();
        let feed_id = FeedID::new(url_text);
        let feed = feed_parser::download_and_parse_feed(&url, &feed_id, None, Arc::new(Semaphore::new(20)), &Client::new())
            .await
            .unwrap();
        let feed = match feed {
            ParsedUrl::SingleFeed(feed) => feed,
            ParsedUrl::MultipleFeeds(_) => panic!("Expected Single Feed"),
        };
        assert_eq!(feed.label, "Golem.de");
        assert_eq!(feed.website.unwrap().to_string(), "https://www.golem.de/");
    }
    // RSS 2.0 feed reached via a plain-http url (exercises redirects to https).
    #[test(tokio::test)]
    pub async fn planet_gnome_rss() {
        let url_text = "http://planet.gnome.org/rss20.xml";
        let url = Url::parse(url_text).unwrap();
        let feed_id = FeedID::new(url_text);
        let feed = feed_parser::download_and_parse_feed(&url, &feed_id, None, Arc::new(Semaphore::new(20)), &Client::new())
            .await
            .unwrap();
        let feed = match feed {
            ParsedUrl::SingleFeed(feed) => feed,
            ParsedUrl::MultipleFeeds(_) => panic!("Expected Single Feed"),
        };
        assert_eq!(feed.label, "Planet GNOME");
        assert_eq!(feed.icon_url, None);
        assert_eq!(feed.website.unwrap().to_string(), "https://planet.gnome.org/atom.xml");
    }
    // HTML landing page: feed discovery should resolve to a single feed url.
    #[test(tokio::test)]
    pub async fn theverge_find_feeds() {
        let url_text = "https://www.theverge.com/";
        let url = Url::parse(url_text).unwrap();
        let feed_id = FeedID::new(url_text);
        let feed_vec = feed_parser::download_and_parse_feed(&url, &feed_id, None, Arc::new(Semaphore::new(20)), &Client::new())
            .await
            .unwrap();
        let feed = match feed_vec {
            ParsedUrl::MultipleFeeds(_) => panic!("Expected single Feed"),
            ParsedUrl::SingleFeed(feed) => feed,
        };
        assert_eq!(feed.label, "The Verge");
        assert_eq!(feed.feed_url.unwrap().as_str(), "https://www.theverge.com/rss/index.xml");
    }
    // HTML page with exactly one advertised feed -> follows the link.
    #[test(tokio::test)]
    pub async fn paulstamatiou_find_feeds() {
        let url_text = "https://paulstamatiou.com";
        let url = Url::parse(url_text).unwrap();
        let feed_id = FeedID::new(url_text);
        let parsed_feed = feed_parser::download_and_parse_feed(&url, &feed_id, None, Arc::new(Semaphore::new(20)), &Client::new())
            .await
            .unwrap();
        let feed = match parsed_feed {
            ParsedUrl::MultipleFeeds(_) => panic!("Expected Single Feed"),
            ParsedUrl::SingleFeed(feed) => feed,
        };
        assert_eq!(feed.label, "Paul Stamatiou");
        assert_eq!(feed.feed_url.unwrap().to_string(), "https://paulstamatiou.com/posts.xml");
    }
    // WordPress-hosted feed url fetched directly.
    #[test(tokio::test)]
    pub async fn dragonsneverforget() {
        let url_text = "https://dragonsneverforget.wordpress.com/feed/";
        let url = Url::parse(url_text).unwrap();
        let feed_id = FeedID::new(url_text);
        let parsed_feed = feed_parser::download_and_parse_feed(&url, &feed_id, None, Arc::new(Semaphore::new(20)), &Client::new())
            .await
            .unwrap();
        let feed = match parsed_feed {
            ParsedUrl::MultipleFeeds(_) => panic!("Expected Single Feed"),
            ParsedUrl::SingleFeed(feed) => feed,
        };
        assert_eq!(feed.label, "Dragons Never Forget");
        assert_eq!(feed.feed_url.unwrap().to_string(), "https://dragonsneverforget.wordpress.com/feed/");
        assert_eq!(feed.website.unwrap().to_string(), "https://dragonsneverforget.wordpress.com/");
    }
}