use crate::client::FetchOptions;
use crate::error::FetchError;
use crate::fetchers::Fetcher;
use crate::types::{FetchRequest, FetchResponse};
use crate::DEFAULT_USER_AGENT;
use async_trait::async_trait;
use reqwest::header::{HeaderValue, ACCEPT, USER_AGENT};
use std::time::Duration;
use url::Url;
/// Timeout applied both to establishing the connection and to the whole request.
const API_TIMEOUT: Duration = Duration::from_secs(10);
/// Upper bound on the number of feed entries rendered into the Markdown output.
const MAX_ENTRIES: usize = 20;
/// Stateless fetcher that downloads RSS/Atom feeds and renders them as Markdown.
pub struct RSSFeedFetcher;
impl RSSFeedFetcher {
pub fn new() -> Self {
Self
}
fn is_feed_url(url: &Url) -> bool {
let path = url.path().to_lowercase();
path.ends_with("/feed")
|| path.ends_with("/feed/")
|| path.ends_with("/rss")
|| path.ends_with("/rss/")
|| path.ends_with("/atom")
|| path.ends_with("/atom/")
|| path.ends_with("/rss.xml")
|| path.ends_with("/atom.xml")
|| path.ends_with("/feed.xml")
|| path.ends_with("/index.xml")
|| path.ends_with("/feed.rss")
|| path.ends_with("/feed.atom")
|| path.ends_with(".rss")
|| path == "/rss"
|| path == "/feed"
}
}
impl Default for RSSFeedFetcher {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Fetcher for RSSFeedFetcher {
    /// Identifier under which this fetcher reports itself.
    fn name(&self) -> &'static str {
        "rss_feed"
    }

    /// Returns `true` when the URL path looks like a feed endpoint.
    fn matches(&self, url: &Url) -> bool {
        Self::is_feed_url(url)
    }

    /// Downloads the feed at `request.url` and renders it as Markdown.
    ///
    /// * A non-success HTTP status is returned as `Ok` with `error` set to
    ///   `"HTTP <code>"` and no content.
    /// * A body that looks like RSS or Atom is rendered with format
    ///   `"rss_feed"` and content type `"text/markdown"`.
    /// * Any other body is passed through unchanged with format `"raw"`.
    ///
    /// In every case `status_code` is the status the server actually sent.
    ///
    /// # Errors
    ///
    /// Returns a [`FetchError`] when the HTTP client cannot be built, the
    /// request fails, or the response body cannot be read as text.
    async fn fetch(
        &self,
        request: &FetchRequest,
        options: &FetchOptions,
    ) -> Result<FetchResponse, FetchError> {
        let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);

        let mut client_builder = reqwest::Client::builder()
            .connect_timeout(API_TIMEOUT)
            .timeout(API_TIMEOUT)
            .redirect(reqwest::redirect::Policy::limited(5));
        if !options.respect_proxy_env {
            client_builder = client_builder.no_proxy();
        }
        let client = client_builder
            .build()
            .map_err(FetchError::ClientBuildError)?;

        // A caller-supplied user agent that is not a valid header value falls
        // back to the default instead of failing the request.
        let ua_header = HeaderValue::from_str(user_agent)
            .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));

        let response = client
            .get(&request.url)
            .header(USER_AGENT, ua_header)
            .header(
                ACCEPT,
                HeaderValue::from_static(
                    "application/rss+xml, application/atom+xml, application/xml, text/xml, */*",
                ),
            )
            .send()
            .await
            .map_err(FetchError::from_reqwest)?;

        let status_code = response.status().as_u16();
        if !response.status().is_success() {
            return Ok(FetchResponse {
                url: request.url.clone(),
                status_code,
                error: Some(format!("HTTP {}", status_code)),
                ..Default::default()
            });
        }

        let body = response
            .text()
            .await
            .map_err(|e| FetchError::RequestError(e.to_string()))?;

        // Cheap substring sniffing: RSS markers are checked first, then Atom.
        let looks_like_rss = body.contains("<rss") || body.contains("<channel>");
        let content = if looks_like_rss {
            parse_rss(&body)
        } else if body.contains("<feed") {
            parse_atom(&body)
        } else {
            // Not a recognizable feed: hand the body back untouched.
            return Ok(FetchResponse {
                url: request.url.clone(),
                status_code,
                content: Some(body),
                format: Some("raw".to_string()),
                ..Default::default()
            });
        };

        Ok(FetchResponse {
            url: request.url.clone(),
            status_code,
            content_type: Some("text/markdown".to_string()),
            format: Some("rss_feed".to_string()),
            content: Some(content),
            ..Default::default()
        })
    }
}
/// Renders an RSS document as Markdown.
///
/// The output starts with the first `<title>` as a level-1 heading (or
/// `"RSS Feed"` when none is found), followed by the first `<description>`
/// and `<link>` if present. Up to `MAX_ENTRIES` `<item>` blocks follow, each
/// with its title, link, publication date and an HTML-stripped description
/// shortened to at most 500 bytes.
fn parse_rss(xml: &str) -> String {
    // Longest description, in bytes, emitted before an ellipsis is appended.
    const MAX_SUMMARY_BYTES: usize = 500;

    let mut out = String::new();

    let feed_title = extract_first_tag(xml, "title").unwrap_or_else(|| "RSS Feed".to_string());
    out.push_str(&format!("# {}\n\n", decode_entities(&feed_title)));
    if let Some(desc) = extract_first_tag(xml, "description") {
        out.push_str(&format!("{}\n\n", decode_entities(&desc)));
    }
    if let Some(link) = extract_first_tag(xml, "link") {
        // URLs with query strings carry escaped ampersands in XML text.
        out.push_str(&format!("- **Link:** {}\n", decode_entities(&link)));
    }

    let items = extract_blocks(xml, "item");
    if items.is_empty() {
        return out;
    }

    out.push_str(&format!(
        "\n## Entries ({})\n",
        items.len().min(MAX_ENTRIES)
    ));
    for item_xml in items.iter().take(MAX_ENTRIES) {
        let title = extract_first_tag(item_xml, "title")
            .map(|t| decode_entities(&t))
            .unwrap_or_else(|| "(untitled)".to_string());
        let link = extract_first_tag(item_xml, "link")
            .map(|l| decode_entities(&l))
            .unwrap_or_default();
        let pub_date = extract_first_tag(item_xml, "pubDate");
        let description = extract_first_tag(item_xml, "description").map(|d| decode_entities(&d));

        out.push_str(&format!("\n### {}\n\n", title));
        if !link.is_empty() {
            out.push_str(&format!("- **Link:** {}\n", link));
        }
        if let Some(date) = pub_date {
            out.push_str(&format!("- **Published:** {}\n", date));
        }
        if let Some(desc) = description {
            let cleaned = strip_html(&desc);
            if !cleaned.is_empty() {
                let truncated = if cleaned.len() > MAX_SUMMARY_BYTES {
                    // Back up to a UTF-8 boundary: slicing through the middle
                    // of a multi-byte character would panic.
                    let mut cut = MAX_SUMMARY_BYTES;
                    while !cleaned.is_char_boundary(cut) {
                        cut -= 1;
                    }
                    format!("{}...", &cleaned[..cut])
                } else {
                    cleaned
                };
                out.push_str(&format!("\n{}\n", truncated));
            }
        }
    }
    out
}
/// Renders an Atom document as Markdown.
///
/// The output starts with the first `<title>` as a level-1 heading (or
/// `"Atom Feed"` when none is found), followed by the `<subtitle>` if present.
/// Up to `MAX_ENTRIES` `<entry>` blocks follow, each with its title, link,
/// author name, date (`<published>`, falling back to `<updated>`) and an
/// HTML-stripped summary shortened to at most 500 bytes.
fn parse_atom(xml: &str) -> String {
    // Longest summary, in bytes, emitted before an ellipsis is appended.
    const MAX_SUMMARY_BYTES: usize = 500;

    let mut out = String::new();

    let feed_title = extract_first_tag(xml, "title").unwrap_or_else(|| "Atom Feed".to_string());
    out.push_str(&format!("# {}\n\n", decode_entities(&feed_title)));
    if let Some(subtitle) = extract_first_tag(xml, "subtitle") {
        out.push_str(&format!("{}\n\n", decode_entities(&subtitle)));
    }

    let entries = extract_blocks(xml, "entry");
    if entries.is_empty() {
        return out;
    }

    out.push_str(&format!(
        "\n## Entries ({})\n",
        entries.len().min(MAX_ENTRIES)
    ));
    for entry_xml in entries.iter().take(MAX_ENTRIES) {
        let title = extract_first_tag(entry_xml, "title")
            .map(|t| decode_entities(&t))
            .unwrap_or_else(|| "(untitled)".to_string());
        // Attribute values are XML-escaped, so query-string ampersands need decoding.
        let link = extract_link_href(entry_xml)
            .map(|l| decode_entities(&l))
            .unwrap_or_default();
        let updated = extract_first_tag(entry_xml, "updated");
        let published = extract_first_tag(entry_xml, "published");
        let summary = extract_first_tag(entry_xml, "summary").map(|s| decode_entities(&s));
        let author = extract_first_tag(entry_xml, "name");

        out.push_str(&format!("\n### {}\n\n", title));
        if !link.is_empty() {
            out.push_str(&format!("- **Link:** {}\n", link));
        }
        if let Some(author) = author {
            out.push_str(&format!("- **Author:** {}\n", author));
        }
        if let Some(date) = published.or(updated) {
            out.push_str(&format!("- **Published:** {}\n", date));
        }
        if let Some(summary) = summary {
            let cleaned = strip_html(&summary);
            if !cleaned.is_empty() {
                let truncated = if cleaned.len() > MAX_SUMMARY_BYTES {
                    // Back up to a UTF-8 boundary: slicing through the middle
                    // of a multi-byte character would panic.
                    let mut cut = MAX_SUMMARY_BYTES;
                    while !cleaned.is_char_boundary(cut) {
                        cut -= 1;
                    }
                    format!("{}...", &cleaned[..cut])
                } else {
                    cleaned
                };
                out.push_str(&format!("\n{}\n", truncated));
            }
        }
    }
    out
}
/// Returns the trimmed text content of the first `<tag>` element in `xml`.
///
/// * The tag name must match exactly: looking for `link` does not match
///   `<linkage>`. Attributes on the opening tag are allowed.
/// * A single CDATA wrapper around the content is removed, even when it is
///   surrounded by whitespace.
/// * A self-closing element (`<tag/>`) yields an empty string.
///
/// Returns `None` when no such element exists or it is never closed. This is
/// a plain text scan, not an XML parser: nested elements of the same name and
/// `>` characters inside attribute values are not handled.
fn extract_first_tag(xml: &str, tag: &str) -> Option<String> {
    let open = format!("<{}", tag);
    let close = format!("</{}>", tag);

    // Find the `>` that ends the first opening tag whose name is exactly `tag`.
    let mut search_from = 0;
    let tag_end = loop {
        let start = search_from + xml[search_from..].find(&open)?;
        let after_name = start + open.len();
        let boundary = xml[after_name..].chars().next()?;
        if boundary == '>' || boundary == '/' || boundary.is_whitespace() {
            break after_name + xml[after_name..].find('>')?;
        }
        // Only a prefix of a longer name (e.g. `<linkage`); keep looking.
        search_from = after_name;
    };

    // `<tag/>` has no content; without this check the scan would run on to
    // the closing tag of some later element.
    if xml[..tag_end].ends_with('/') {
        return Some(String::new());
    }

    let content_start = tag_end + 1;
    let content_end = content_start + xml[content_start..].find(&close)?;

    // Trim before looking for CDATA so that a wrapper placed on its own line
    // is still recognized.
    let content = xml[content_start..content_end].trim();
    let content = content
        .strip_prefix("<![CDATA[")
        .and_then(|c| c.strip_suffix("]]>"))
        .map(str::trim)
        .unwrap_or(content);
    Some(content.to_string())
}
/// Collects every `<tag …>…</tag>` block in `xml`, in document order.
///
/// Each returned string includes the opening and closing tags. The tag name
/// must match exactly, so looking for `item` does not start a block at
/// `<items>`. A self-closing `<tag/>` has no body and is skipped. Scanning
/// stops at the first opening tag that has no closing tag after it.
///
/// This is a plain text scan: nested elements of the same name are not
/// supported.
fn extract_blocks(xml: &str, tag: &str) -> Vec<String> {
    let open = format!("<{}", tag);
    let close = format!("</{}>", tag);
    let mut results = Vec::new();
    let mut search_from = 0;

    while let Some(found) = xml[search_from..].find(&open) {
        let start = search_from + found;
        let after_name = start + open.len();

        // Require the name to end here; otherwise `<items>` would be taken
        // for the start of an `<item>` block.
        let is_element_start = matches!(
            xml[after_name..].chars().next(),
            Some(c) if c == '>' || c.is_whitespace()
        );
        if !is_element_start {
            search_from = after_name;
            continue;
        }

        match xml[after_name..].find(&close) {
            Some(offset) => {
                let end = after_name + offset + close.len();
                results.push(xml[start..end].to_string());
                search_from = end;
            }
            None => break,
        }
    }
    results
}
/// Returns the `href` of the most suitable `<link>` element in `xml`.
///
/// Every `<link …>` tag is inspected in document order. The first one whose
/// `rel` is absent or `"alternate"` wins; if there is none, the first link
/// that carries an `href` at all is returned. Attribute values may be quoted
/// with either `"` or `'`. The value is returned as written (no entity
/// decoding). Returns `None` when no `<link>` with an `href` exists.
fn extract_link_href(xml: &str) -> Option<String> {
    const OPEN: &str = "<link";

    let mut fallback: Option<&str> = None;
    let mut search_from = 0;

    while let Some(found) = xml[search_from..].find(OPEN) {
        let start = search_from + found;
        let after_name = start + OPEN.len();
        let tag_end = match xml[after_name..].find('>') {
            Some(offset) => after_name + offset,
            None => break,
        };
        search_from = tag_end + 1;

        // Skip elements whose name merely starts with "link".
        let is_link = matches!(
            xml[after_name..].chars().next(),
            Some(c) if c == '>' || c == '/' || c.is_whitespace()
        );
        if !is_link {
            continue;
        }

        let tag = &xml[start..=tag_end];
        let href = match xml_attr_value(tag, "href") {
            Some(value) => value,
            None => continue,
        };
        match xml_attr_value(tag, "rel") {
            // A link without `rel` is treated like `rel="alternate"`.
            None | Some("alternate") => return Some(href.to_string()),
            Some(_) => {
                if fallback.is_none() {
                    fallback = Some(href);
                }
            }
        }
    }
    fallback.map(str::to_string)
}

/// Looks up the quoted value of attribute `name` inside a single opening tag.
///
/// The attribute name must be preceded by whitespace, so `href` does not
/// match inside `xhref`. Returns `None` if the attribute is missing, unquoted
/// or its closing quote is absent.
fn xml_attr_value<'a>(tag: &'a str, name: &str) -> Option<&'a str> {
    if name.is_empty() {
        return None;
    }
    let mut search_from = 0;
    while let Some(found) = tag[search_from..].find(name) {
        let at = search_from + found;
        search_from = at + name.len();

        let starts_attribute = tag[..at]
            .chars()
            .next_back()
            .map_or(false, char::is_whitespace);
        if !starts_attribute {
            continue;
        }
        let after_eq = match tag[search_from..].trim_start().strip_prefix('=') {
            Some(rest) => rest.trim_start(),
            None => continue,
        };
        let quote = match after_eq.chars().next() {
            Some(q) if q == '"' || q == '\'' => q,
            _ => continue,
        };
        let value = &after_eq[1..];
        return value.find(quote).map(|end| &value[..end]);
    }
    None
}
/// Decodes XML character references in `s` in a single left-to-right pass.
///
/// Recognized are the five predefined named references (`amp`, `lt`, `gt`,
/// `quot`, `apos`) and numeric references in decimal (`#39`) or hexadecimal
/// (`#x27`) form. Anything else, including a bare ampersand or an unknown
/// name, is copied through unchanged.
///
/// Decoded output is never scanned again, so an escaped ampersand followed by
/// `lt;` stays a literal entity instead of collapsing into `<`.
fn decode_entities(s: &str) -> String {
    // Names are listed without the leading ampersand because matching happens
    // on the text that follows it.
    const NAMED: [(&str, char); 5] = [
        ("amp;", '&'),
        ("lt;", '<'),
        ("gt;", '>'),
        ("quot;", '"'),
        ("apos;", '\''),
    ];

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(pos) = rest.find('&') {
        out.push_str(&rest[..pos]);
        let tail = &rest[pos + 1..];

        let decoded = NAMED
            .iter()
            .find(|(name, _)| tail.starts_with(*name))
            .map(|&(name, ch)| (ch, name.len()))
            .or_else(|| decode_numeric_reference(tail));

        match decoded {
            Some((ch, consumed)) => {
                out.push(ch);
                rest = &tail[consumed..];
            }
            None => {
                // Not a reference we understand: keep the ampersand literally.
                out.push('&');
                rest = tail;
            }
        }
    }
    out.push_str(rest);
    out
}

/// Decodes a numeric character reference whose leading ampersand has already
/// been consumed (`tail` starts at `#`).
///
/// Returns the character and the number of bytes of `tail` it occupied, or
/// `None` if `tail` does not start with a well-formed reference to a valid,
/// non-NUL Unicode scalar value.
fn decode_numeric_reference(tail: &str) -> Option<(char, usize)> {
    // Enough for any code point plus leading zeros; bounds the `;` search so
    // stray ampersands in long text stay cheap.
    const MAX_DIGITS: usize = 8;

    let body = tail.strip_prefix('#')?;
    let semi = body
        .bytes()
        .take(MAX_DIGITS + 2)
        .position(|b| b == b';')?;
    let digits = &body[..semi];

    let code = match digits.strip_prefix('x').or_else(|| digits.strip_prefix('X')) {
        Some(hex) if !hex.is_empty() && hex.bytes().all(|b| b.is_ascii_hexdigit()) => {
            u32::from_str_radix(hex, 16).ok()?
        }
        None if !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit()) => {
            digits.parse::<u32>().ok()?
        }
        _ => return None,
    };

    let ch = std::char::from_u32(code).filter(|c| *c != '\0')?;
    // '#' + digits + ';'
    Some((ch, 1 + semi + 1))
}
/// Removes HTML/XML tags from `html` and trims surrounding whitespace.
///
/// A `<` starts a tag only when it is followed by a letter, `/`, `!` or `?`;
/// otherwise it is ordinary text (as in `1 < 2`). A `>` ends a tag only while
/// inside one, so a stray `>` in running text is kept. Text inside an
/// unterminated tag is dropped.
fn strip_html(html: &str) -> String {
    let mut result = String::with_capacity(html.len());
    let mut in_tag = false;
    let mut chars = html.chars().peekable();

    while let Some(c) = chars.next() {
        match c {
            '<' if !in_tag => {
                let opens_tag = matches!(
                    chars.peek(),
                    Some(&next) if next.is_ascii_alphabetic()
                        || next == '/'
                        || next == '!'
                        || next == '?'
                );
                if opens_tag {
                    in_tag = true;
                } else {
                    result.push(c);
                }
            }
            '>' if in_tag => in_tag = false,
            _ if !in_tag => result.push(c),
            _ => {}
        }
    }
    result.trim().to_string()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the XML entity reference for `name` (`entity("amp")` is an
    /// escaped ampersand). Assembling the text at runtime keeps the test
    /// inputs escaped even if tooling unescapes markup in source files.
    fn entity(name: &str) -> String {
        format!("&{};", name)
    }

    /// Well-known feed paths are recognized; an ordinary page path is not.
    #[test]
    fn test_is_feed_url() {
        let url = Url::parse("https://example.com/feed").unwrap();
        assert!(RSSFeedFetcher::is_feed_url(&url));
        let url = Url::parse("https://example.com/rss.xml").unwrap();
        assert!(RSSFeedFetcher::is_feed_url(&url));
        let url = Url::parse("https://example.com/atom.xml").unwrap();
        assert!(RSSFeedFetcher::is_feed_url(&url));
        let url = Url::parse("https://example.com/blog/feed").unwrap();
        assert!(RSSFeedFetcher::is_feed_url(&url));
        let url = Url::parse("https://example.com/index.xml").unwrap();
        assert!(RSSFeedFetcher::is_feed_url(&url));
        let url = Url::parse("https://example.com/page").unwrap();
        assert!(!RSSFeedFetcher::is_feed_url(&url));
    }

    /// `Fetcher::matches` agrees with the URL heuristic.
    #[test]
    fn test_fetcher_matches() {
        let fetcher = RSSFeedFetcher::new();
        let url = Url::parse("https://blog.example.com/feed").unwrap();
        assert!(fetcher.matches(&url));
        let url = Url::parse("https://example.com/page").unwrap();
        assert!(!fetcher.matches(&url));
    }

    /// Channel metadata and items (plain and CDATA descriptions) are rendered.
    #[test]
    fn test_parse_rss() {
        let xml = r#"<?xml version="1.0"?>
<rss version="2.0">
<channel>
<title>My Blog</title>
<description>A test blog</description>
<link>https://example.com</link>
<item>
<title>First Post</title>
<link>https://example.com/first</link>
<pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate>
<description>This is the first post.</description>
</item>
<item>
<title>Second Post</title>
<link>https://example.com/second</link>
<description><![CDATA[<p>HTML content</p>]]></description>
</item>
</channel>
</rss>"#;
        let output = parse_rss(xml);
        assert!(output.contains("# My Blog"));
        assert!(output.contains("A test blog"));
        assert!(output.contains("### First Post"));
        assert!(output.contains("https://example.com/first"));
        assert!(output.contains("This is the first post."));
        assert!(output.contains("### Second Post"));
        assert!(output.contains("HTML content"));
    }

    /// Feed title and a single entry with link, author and summary are rendered.
    #[test]
    fn test_parse_atom() {
        let xml = r#"<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>My Blog</title>
<subtitle>A test blog</subtitle>
<entry>
<title>First Entry</title>
<link href="https://example.com/first"/>
<published>2024-01-01T00:00:00Z</published>
<author><name>Alice</name></author>
<summary>Entry summary here.</summary>
</entry>
</feed>"#;
        let output = parse_atom(xml);
        assert!(output.contains("# My Blog"));
        assert!(output.contains("### First Entry"));
        assert!(output.contains("https://example.com/first"));
        assert!(output.contains("Alice"));
        assert!(output.contains("Entry summary here."));
    }

    /// Escaped input must come back decoded; entity-free input alone would
    /// also pass against a decoder that does nothing.
    #[test]
    fn test_decode_entities() {
        let escaped_amp = format!("a {} b", entity("amp"));
        assert_eq!(decode_entities(&escaped_amp), "a & b");

        let escaped_tag = format!("{}tag{}", entity("lt"), entity("gt"));
        assert_eq!(decode_entities(&escaped_tag), "<tag>");

        assert_eq!(decode_entities("plain text"), "plain text");
    }

    /// Nested tags are removed and their text is kept.
    #[test]
    fn test_strip_html() {
        assert_eq!(strip_html("<p>Hello <b>world</b></p>"), "Hello world");
    }
}