use kuchiki::traits::*;
use std::fmt;
pub use url::Url;
/// Substrings that mark an anchor's `href` as a possible feed link when the
/// page body is scanned; the check is a plain, case-sensitive `contains`.
const MIGHT_BE_FEED: [&str; 4] = ["feed", "xml", "rss", "atom"];
/// Errors that can abort a feed search.
#[derive(Debug, PartialEq)]
pub enum FeedFinderError {
/// A candidate feed URL could not be parsed or resolved against the base URL.
Url(url::ParseError),
/// Running a CSS selector against the parsed document failed.
Select,
}
/// How a detected feed was identified.
#[derive(Debug, PartialEq)]
pub enum FeedType {
/// Declared by a `<link rel="alternate">` element with an RSS MIME type.
Rss,
/// Declared by a `<link rel="alternate">` element with an Atom MIME type;
/// also used for the feeds derived from YouTube URLs.
Atom,
/// Declared by a `<link rel="alternate">` element with a JSON MIME type.
Json,
/// Found as an `<a>` in the document whose `href` looks feed-like.
Link,
/// Not present in the page at all; a conventional location guessed from
/// hints in the markup or the host name.
Guess,
}
/// A feed candidate discovered for a page.
#[derive(Debug, PartialEq)]
pub struct Feed {
// Absolute URL of the feed.
url: Url,
// How the feed was discovered.
type_: FeedType,
// Title taken from the `title` attribute of the declaring `<link>`, when
// there is one; the other discovery strategies leave this as `None`.
title: Option<String>,
}
/// Outcome of a detection pass: the feeds found (possibly none), or the error
/// that aborted the search.
type FeedResult = Result<Vec<Feed>, FeedFinderError>;
/// Internal state shared by the individual detection strategies.
struct FeedFinder<'a> {
// Parsed HTML document that is searched for feeds.
doc: kuchiki::NodeRef,
// URL the document came from; relative links are resolved against it.
base_url: &'a Url,
}
/// Finds feeds for the page `html` that was fetched from `base_url`.
///
/// The strategies are tried in a fixed order: `<link rel="alternate">`
/// elements, YouTube URL patterns, feed-like anchors, and finally guesses
/// based on hints in the markup. The feeds of the first strategy that finds
/// anything are returned; if none finds anything the result is an empty list.
///
/// # Errors
///
/// Returns the first [`FeedFinderError`] raised by a strategy; later
/// strategies are not tried in that case.
pub fn detect_feeds(base_url: &Url, html: &str) -> FeedResult {
    let doc = kuchiki::parse_html().one(html);
    let finder = FeedFinder { doc, base_url };

    // Ordered from most to least reliable.
    let strategies = [
        FeedFinder::meta_links,
        FeedFinder::youtube,
        FeedFinder::body_links,
        FeedFinder::guess,
    ];

    for strategy in strategies.iter() {
        let found = strategy(&finder)?;
        if found.is_empty() {
            continue;
        }
        return Ok(found);
    }

    Ok(Vec::new())
}
/// Returns the `nth` (zero-based) path segment of `url`.
///
/// Yields `None` when the URL has no path segments at all or has fewer than
/// `nth + 1` of them.
fn nth_path_segment(url: &Url, nth: usize) -> Option<&str> {
    let mut segments = url.path_segments()?;
    segments.nth(nth)
}
impl<'a> FeedFinder<'a> {
fn meta_links(&self) -> FeedResult {
let mut feeds = vec![];
for link in self
.doc
.select("link[rel='alternate']")
.map_err(|_| FeedFinderError::Select)?
{
let attrs = link.attributes.borrow();
let title = attrs.get("title").map(|title| title.to_owned());
match (attrs.get("type"), attrs.get("href")) {
(Some("application/rss+xml"), Some(href)) => feeds.push(Feed {
url: self.base_url.join(href).map_err(FeedFinderError::Url)?,
type_: FeedType::Rss,
title,
}),
(Some("application/atom+xml"), Some(href)) => feeds.push(Feed {
url: self.base_url.join(href).map_err(FeedFinderError::Url)?,
type_: FeedType::Atom,
title,
}),
(Some("application/json"), Some(href)) => feeds.push(Feed {
url: self.base_url.join(href).map_err(FeedFinderError::Url)?,
type_: FeedType::Json,
title,
}),
_ => (),
}
}
Ok(feeds)
}
fn youtube(&self) -> FeedResult {
let mut feeds = vec![];
let url = self.base_url.as_str();
if url.starts_with("https://www.youtube.com/channel/") {
if let Some(id) = nth_path_segment(self.base_url, 1) {
let feed = Url::parse(&format!(
"https://www.youtube.com/feeds/videos.xml?channel_id={}",
id
))
.map_err(FeedFinderError::Url)?;
feeds.push(Feed {
url: feed,
type_: FeedType::Atom,
title: None,
});
}
} else if url.starts_with("https://www.youtube.com/user/") {
if let Some(id) = nth_path_segment(self.base_url, 1) {
let feed = Url::parse(&format!(
"https://www.youtube.com/feeds/videos.xml?user={}",
id
))
.map_err(FeedFinderError::Url)?;
feeds.push(Feed {
url: feed,
type_: FeedType::Atom,
title: None,
});
}
} else if url.starts_with("https://www.youtube.com/playlist?list=")
|| url.starts_with("https://www.youtube.com/watch")
{
for (key, value) in self.base_url.query_pairs() {
if key == "list" {
let feed = Url::parse(&format!(
"https://www.youtube.com/feeds/videos.xml?playlist_id={}",
value
))
.map_err(FeedFinderError::Url)?;
feeds.push(Feed {
url: feed,
type_: FeedType::Atom,
title: None,
});
break;
}
}
}
Ok(feeds)
}
fn body_links(&self) -> FeedResult {
let mut feeds = vec![];
for a in self.doc.select("a").map_err(|_| FeedFinderError::Select)? {
let attrs = a.attributes.borrow();
if let Some(href) = attrs.get("href") {
if MIGHT_BE_FEED.iter().any(|hint| href.contains(hint)) {
feeds.push(Feed {
url: self.base_url.join(href).map_err(FeedFinderError::Url)?,
type_: FeedType::Link,
title: None,
})
}
}
}
Ok(feeds)
}
fn guess_segments(&self, feed_file: &str) -> FeedResult {
let mut feeds = Vec::new();
if let Some(segments) = self.base_url.path_segments() {
let mut remaining_segments = segments.collect::<Vec<_>>();
let mut segments = vec!["", feed_file];
loop {
let url = self
.base_url
.join(&segments.join("/"))
.map_err(FeedFinderError::Url)?;
feeds.push(Feed {
url,
type_: FeedType::Guess,
title: None,
});
if remaining_segments.is_empty() {
break;
}
let index = segments.len() - 1;
let segment = remaining_segments.remove(0);
if segment.is_empty() {
break;
}
segments.insert(index, segment);
}
}
Ok(feeds)
}
fn guess(&self) -> FeedResult {
let markup = self.doc.to_string().to_lowercase();
let url = if markup.contains("tumblr.com") {
Some(self.base_url.join("/rss").map_err(FeedFinderError::Url)?)
} else if markup.contains("wordpress") {
Some(self.base_url.join("/feed").map_err(FeedFinderError::Url)?)
} else if markup.contains("hugo") {
return self.guess_segments("index.xml");
} else if markup.contains("jekyll")
|| self
.base_url
.host_str()
.map(|host| host.ends_with("github.io"))
.unwrap_or(false)
{
return self.guess_segments("atom.xml");
} else if markup.contains("ghost") {
Some(self.base_url.join("/rss/").map_err(FeedFinderError::Url)?)
} else {
None
};
Ok(url
.map(|url| {
vec![Feed {
url,
type_: FeedType::Guess,
title: None,
}]
})
.unwrap_or_else(Vec::new))
}
}
impl Feed {
pub fn url(&self) -> &Url {
&self.url
}
pub fn feed_type(&self) -> &FeedType {
&self.type_
}
pub fn title(&self) -> Option<&str> {
self.title.as_deref()
}
}
impl fmt::Display for FeedFinderError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
FeedFinderError::Url(err) => err.fmt(f),
FeedFinderError::Select => f.write_str("unable to select elements in doc"),
}
}
}
// Uses the default `Error` methods; the wrapped URL error's message is
// already what `Display` prints for the `Url` variant.
impl std::error::Error for FeedFinderError {}
#[cfg(test)]
mod tests {
    use super::*;

    /// Base URL shared by most tests.
    const EXAMPLE: &str = "http://example.com/";
    /// Minimal page without any feed hints, used by the host-based guesses.
    const PLAIN_HTML: &str = r#"<html><head></head><body>First post!</body</html>"#;
    /// Page body used by the YouTube tests; only the URL matters there.
    const YOUTUBE_HTML: &str = r#"<html><head></head><body>YouTube</body</html>"#;

    /// Builds the `Feed` a test expects to be detected.
    fn feed(url: &str, type_: FeedType, title: Option<&str>) -> Feed {
        Feed {
            url: Url::parse(url).unwrap(),
            type_,
            title: title.map(String::from),
        }
    }

    /// Runs detection for `html` served from `base` and checks the exact result.
    fn assert_detects(base: &str, html: &str, expected: Vec<Feed>) {
        let base = Url::parse(base).unwrap();
        assert_eq!(detect_feeds(&base, html), Ok(expected));
    }

    #[test]
    fn test_detect_meta_atom() {
        assert_detects(
            EXAMPLE,
            r#"<html><head><link rel="alternate" type="application/atom+xml" href="http://example.com/feed.atom"></head></html>"#,
            vec![feed("http://example.com/feed.atom", FeedType::Atom, None)],
        );
    }

    #[test]
    fn test_detect_meta_rss() {
        assert_detects(
            EXAMPLE,
            r#"<html><head><link rel="alternate" type="application/rss+xml" href="http://example.com/feed.rss"></head></html>"#,
            vec![feed("http://example.com/feed.rss", FeedType::Rss, None)],
        );
    }

    #[test]
    fn test_detect_meta_rss_title() {
        assert_detects(
            EXAMPLE,
            r#"<html><head><link rel="alternate" type="application/rss+xml" href="http://example.com/feed.rss" title="RSS Feed"></head></html>"#,
            vec![feed(
                "http://example.com/feed.rss",
                FeedType::Rss,
                Some("RSS Feed"),
            )],
        );
    }

    #[test]
    fn test_detect_meta_rss_title_multiple() {
        // The second title contains a non-breaking space; it is written as an
        // escape so that it stays visible in the source.
        let html = concat!(
            "<html><head>\n",
            r#"<link rel="alternate" type="application/rss+xml" title="WordPress.com Blog" href="https://wordpress.com/blog/feed/">"#,
            "\n",
            r#"<link rel="alternate" type="application/rss+xml" title="WordPress.com News » Drive More Traffic To Your Site With a “Link In Bio” Social Links"#,
            "\u{a0}",
            r#"Page Comments Feed" href="https://wordpress.com/blog/2021/12/07/drive-more-traffic-to-your-site-with-a-link-in-bio-social-links-page/feed/">"#,
            "\n",
            "</head></html>",
        );
        assert_detects(
            "https://wordpress.com/blog/2021/12/07/drive-more-traffic-to-your-site-with-a-link-in-bio-social-links-page/",
            html,
            vec![
                feed(
                    "https://wordpress.com/blog/feed/",
                    FeedType::Rss,
                    Some("WordPress.com Blog"),
                ),
                feed(
                    "https://wordpress.com/blog/2021/12/07/drive-more-traffic-to-your-site-with-a-link-in-bio-social-links-page/feed/",
                    FeedType::Rss,
                    Some("WordPress.com News » Drive More Traffic To Your Site With a “Link In Bio” Social Links\u{a0}Page Comments Feed"),
                ),
            ],
        );
    }

    #[test]
    fn test_detect_meta_rss_relative() {
        assert_detects(
            EXAMPLE,
            r#"<html><head><link rel="alternate" type="application/rss+xml" href="/feed.rss"></head></html>"#,
            vec![feed("http://example.com/feed.rss", FeedType::Rss, None)],
        );
    }

    #[test]
    fn test_detect_meta_json_feed() {
        assert_detects(
            EXAMPLE,
            r#"<html><head><link rel="alternate" type="application/json" href="http://example.com/feed.json"></head></html>"#,
            vec![feed("http://example.com/feed.json", FeedType::Json, None)],
        );
    }

    #[test]
    fn test_body_link_feed() {
        assert_detects(
            EXAMPLE,
            r#"<html><body><a href="/feed/">RSS</a></body</html>"#,
            vec![feed("http://example.com/feed/", FeedType::Link, None)],
        );
    }

    #[test]
    fn test_body_link_xml() {
        assert_detects(
            EXAMPLE,
            r#"<html><body><a href="/index.xml">RSS</a></body</html>"#,
            vec![feed("http://example.com/index.xml", FeedType::Link, None)],
        );
    }

    #[test]
    fn test_body_link_rss() {
        assert_detects(
            EXAMPLE,
            r#"<html><body><a href="/comments.rss">RSS</a></body</html>"#,
            vec![feed("http://example.com/comments.rss", FeedType::Link, None)],
        );
    }

    #[test]
    fn test_body_link_atom() {
        assert_detects(
            EXAMPLE,
            r#"<html><body><a href="http://other.example.com/posts.atom">RSS</a></body</html>"#,
            vec![feed(
                "http://other.example.com/posts.atom",
                FeedType::Link,
                None,
            )],
        );
    }

    #[test]
    fn test_guess_tumblr() {
        assert_detects(
            EXAMPLE,
            r#"<html><head><link href="http://static.tumblr.com/example/jquery.fancybox-1.3.4.css" rel="stylesheet" type="text/css"></head><body>First post!</body</html>"#,
            vec![feed("http://example.com/rss", FeedType::Guess, None)],
        );
    }

    #[test]
    fn test_guess_wordpress() {
        assert_detects(
            EXAMPLE,
            r#"<html><head><meta name="generator" content="WordPress.com" /></head><body>First post!</body</html>"#,
            vec![feed("http://example.com/feed", FeedType::Guess, None)],
        );
    }

    #[test]
    fn test_guess_hugo() {
        assert_detects(
            EXAMPLE,
            r#"<html><head><meta name="generator" content="Hugo 0.27.1" /></head><body>First post!</body</html>"#,
            vec![feed("http://example.com/index.xml", FeedType::Guess, None)],
        );
    }

    #[test]
    fn test_guess_jekyll() {
        assert_detects(
            EXAMPLE,
            r#"<html><head></head><body><!-- Begin Jekyll SEO tag v2.3.0 -->First post!</body</html>"#,
            vec![feed("http://example.com/atom.xml", FeedType::Guess, None)],
        );
    }

    #[test]
    fn test_guess_github_io() {
        assert_detects(
            "http://example.github.io/",
            PLAIN_HTML,
            vec![feed(
                "http://example.github.io/atom.xml",
                FeedType::Guess,
                None,
            )],
        );
    }

    #[test]
    fn test_guess_ghost() {
        assert_detects(
            EXAMPLE,
            r#"<html><head><meta name="generator" content="Ghost 1.21" /></head><body>First post!</body</html>"#,
            vec![feed("http://example.com/rss/", FeedType::Guess, None)],
        );
    }

    #[test]
    fn test_guess_hugo_non_root() {
        assert_detects(
            "http://example.com/blog/post/",
            r#"<html><head><meta name="generator" content="Hugo 0.27.1" /></head><body>First post!</body</html>"#,
            vec![
                feed("http://example.com/index.xml", FeedType::Guess, None),
                feed("http://example.com/blog/index.xml", FeedType::Guess, None),
                feed(
                    "http://example.com/blog/post/index.xml",
                    FeedType::Guess,
                    None,
                ),
            ],
        );
    }

    #[test]
    fn test_guess_jekyll_non_root() {
        assert_detects(
            "http://example.github.io/blog/post/",
            PLAIN_HTML,
            vec![
                feed("http://example.github.io/atom.xml", FeedType::Guess, None),
                feed(
                    "http://example.github.io/blog/atom.xml",
                    FeedType::Guess,
                    None,
                ),
                feed(
                    "http://example.github.io/blog/post/atom.xml",
                    FeedType::Guess,
                    None,
                ),
            ],
        );
    }

    #[test]
    fn test_youtube_channel() {
        assert_detects(
            "https://www.youtube.com/channel/UCaYhcUwRBNscFNUKTjgPFiA",
            YOUTUBE_HTML,
            vec![feed(
                "https://www.youtube.com/feeds/videos.xml?channel_id=UCaYhcUwRBNscFNUKTjgPFiA",
                FeedType::Atom,
                None,
            )],
        );
    }

    #[test]
    fn test_youtube_user() {
        assert_detects(
            "https://www.youtube.com/user/wezmnet",
            YOUTUBE_HTML,
            vec![feed(
                "https://www.youtube.com/feeds/videos.xml?user=wezmnet",
                FeedType::Atom,
                None,
            )],
        );
    }

    #[test]
    fn test_youtube_playlist() {
        assert_detects(
            "https://www.youtube.com/playlist?list=PLTOeCUgrkpMNEHx6j0vCH0cuyAIVZadnc",
            YOUTUBE_HTML,
            vec![feed(
                "https://www.youtube.com/feeds/videos.xml?playlist_id=PLTOeCUgrkpMNEHx6j0vCH0cuyAIVZadnc",
                FeedType::Atom,
                None,
            )],
        );
    }

    #[test]
    fn test_youtube_watch_playlist() {
        assert_detects(
            "https://www.youtube.com/watch?v=0gjFYpvHyrY&list=FLOEg2K4TcePNx9SdGdR0zpg",
            YOUTUBE_HTML,
            vec![feed(
                "https://www.youtube.com/feeds/videos.xml?playlist_id=FLOEg2K4TcePNx9SdGdR0zpg",
                FeedType::Atom,
                None,
            )],
        );
    }
}