use std::sync::Arc;
use quick_xml::Reader;
use quick_xml::events::{BytesStart, Event};
use reqwest::Url;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use crate::fetch::HttpFetchProvider;
use crate::search::{WebFetchBackend, WebFetchError, WebFetchRequest, WebFetchResponse};
/// Syndication formats this module can recognize and parse.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FeedFormat {
    /// RSS 2.0 / RDF-based feeds (`<rss>` or `<rdf>` root element).
    Rss,
    /// Atom feeds (`<feed>` root element).
    Atom,
    /// JSON Feed documents (top-level JSON object with an `items` array).
    JsonFeed,
    /// Format not (yet) determined; also the `Default` variant (impl below).
    Unknown,
}
/// How a candidate feed endpoint was discovered during a probe.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FeedDiscoverySource {
    /// Found via a `<link rel="alternate" type="…">` tag in fetched HTML.
    AlternateLink,
    /// Guessed from a well-known path such as `/feed` or `/rss.xml`.
    CommonPath,
    /// The probed URL itself already looked like a feed.
    DirectUrl,
}
/// Parameters for feed-endpoint discovery against a site URL.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeedProbeRequest {
    /// Page or feed URL to probe.
    pub url: String,
    /// Also emit guesses for well-known feed paths (`/feed`, `/rss.xml`, ...).
    #[serde(default = "default_probe_common_paths")]
    pub probe_common_paths: bool,
    /// Upper bound on the number of candidates returned (applied after dedup).
    #[serde(default = "default_max_candidates")]
    pub max_candidates: usize,
    /// Byte cap for the page fetched during discovery.
    #[serde(default = "default_max_bytes")]
    pub max_bytes: usize,
    /// Fetch timeout in milliseconds.
    #[serde(default = "default_timeout_ms")]
    pub timeout_ms: u64,
}
/// Serde default: probe well-known feed paths during discovery.
fn default_probe_common_paths() -> bool {
    true
}

/// Serde default: cap on how many candidates a probe may return.
fn default_max_candidates() -> usize {
    16
}

/// Serde default: response-body byte cap (1 MiB).
fn default_max_bytes() -> usize {
    1 << 20
}

/// Serde default: request timeout (30 seconds).
fn default_timeout_ms() -> u64 {
    30_000
}
impl FeedProbeRequest {
    /// Build a probe request for `url` with every tunable at its serde default.
    #[must_use]
    pub fn new(url: impl Into<String>) -> Self {
        Self {
            probe_common_paths: default_probe_common_paths(),
            max_candidates: default_max_candidates(),
            max_bytes: default_max_bytes(),
            timeout_ms: default_timeout_ms(),
            url: url.into(),
        }
    }
}
/// One possible feed endpoint found during probing.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FeedEndpointCandidate {
    /// Absolute candidate URL.
    pub url: String,
    /// Best-effort format guess (from the URL shape or a `type` attribute).
    pub format_hint: FeedFormat,
    /// How this candidate was discovered.
    pub discovery_source: FeedDiscoverySource,
    /// Confidence in basis points; higher values sort first in probe output.
    pub confidence_bps: u16,
}
/// Result of a feed probe: ranked candidate endpoints for the input URL.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeedProbeResponse {
    /// Name of the backend that produced this response.
    pub provider: String,
    /// The URL that was probed, echoed verbatim.
    pub input_url: String,
    /// De-duplicated candidates, ordered by descending confidence.
    pub candidates: Vec<FeedEndpointCandidate>,
}
/// Parameters for fetching and parsing a single feed document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeedFetchRequest {
    /// Feed URL to fetch.
    pub url: String,
    /// Extra request headers as `(name, value)` pairs.
    #[serde(default)]
    pub headers: Vec<(String, String)>,
    /// Response-body byte cap.
    #[serde(default = "default_max_bytes")]
    pub max_bytes: usize,
    /// Fetch timeout in milliseconds.
    #[serde(default = "default_timeout_ms")]
    pub timeout_ms: u64,
}
impl FeedFetchRequest {
    /// Build a request for `url` with default byte/time limits and no headers.
    #[must_use]
    pub fn new(url: impl Into<String>) -> Self {
        let url = url.into();
        Self {
            url,
            headers: Vec::new(),
            max_bytes: default_max_bytes(),
            timeout_ms: default_timeout_ms(),
        }
    }

    /// Append one request header (builder style); duplicate names are kept.
    #[must_use]
    pub fn with_header(mut self, name: impl Into<String>, value: impl Into<String>) -> Self {
        let header = (name.into(), value.into());
        self.headers.push(header);
        self
    }
}
impl From<&FeedFetchRequest> for WebFetchRequest {
    /// Translate a feed fetch request into the generic web-fetch request,
    /// carrying over limits, timeout, and every custom header.
    fn from(request: &FeedFetchRequest) -> Self {
        request.headers.iter().fold(
            WebFetchRequest::new(&request.url)
                .with_max_bytes(request.max_bytes)
                .with_timeout_ms(request.timeout_ms),
            |fetch, (name, value)| fetch.with_header(name, value),
        )
    }
}
/// A single normalized entry from an RSS, Atom, or JSON feed.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FeedItem {
    /// Feed-supplied identifier (`<guid>`, `<id>`, or JSON `id`).
    pub id: Option<String>,
    pub title: Option<String>,
    /// Entry link (RSS `<link>` text, Atom `href`, or JSON `url`/`external_url`).
    pub link: Option<String>,
    /// Summary/description/content text; the first non-empty value wins.
    pub summary: Option<String>,
    /// Publication timestamp, verbatim from the feed (not normalized).
    pub published_at: Option<String>,
    /// Update timestamp, verbatim from the feed (not normalized).
    pub updated_at: Option<String>,
    /// Author names; sorted and de-duplicated for JSON feeds, in document
    /// order for XML feeds.
    pub authors: Vec<String>,
    pub categories: Vec<String>,
    /// `sha256:`-prefixed fingerprint over id/title/link/summary.
    pub item_hash: String,
}
/// A fetched and parsed feed document, including the raw payload.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeedFetchResponse {
    /// Name of the backend that produced this response.
    pub provider: String,
    /// Final URL of the fetched document.
    pub url: String,
    /// HTTP status code of the fetch.
    pub status: u16,
    /// `Content-Type` header value, if the server sent one.
    pub content_type: Option<String>,
    /// Detected feed format.
    pub format: FeedFormat,
    /// `sha256:`-prefixed hash of `raw_body`, useful for change detection.
    pub raw_hash: String,
    /// The unmodified (possibly truncated) response body.
    pub raw_body: String,
    /// True if the body was cut off at the configured byte cap.
    pub truncated: bool,
    pub feed_title: Option<String>,
    pub feed_link: Option<String>,
    pub feed_updated_at: Option<String>,
    /// Normalized entries in document order.
    pub items: Vec<FeedItem>,
}
/// Errors produced while probing or fetching feeds.
#[derive(Debug, thiserror::Error)]
pub enum FeedError {
    /// Transport-level failure, stringified from the fetch backend.
    #[error("fetch error: {0}")]
    Fetch(String),
    /// An input or discovered URL failed to parse or resolve.
    #[error("invalid url: {0}")]
    InvalidUrl(String),
    /// The body could not be parsed (malformed JSON or XML).
    #[error("parse error: {0}")]
    Parse(String),
    /// XML parsed cleanly but no recognized feed root element was found.
    #[error("unsupported feed format")]
    UnsupportedFormat,
}
impl From<WebFetchError> for FeedError {
    /// Wrap any fetch-layer failure as a stringified `Fetch` error.
    fn from(error: WebFetchError) -> Self {
        let message = error.to_string();
        Self::Fetch(message)
    }
}
/// A backend capable of discovering and fetching syndication feeds.
pub trait FeedFetchBackend: Send + Sync {
    /// Stable provider identifier echoed in responses.
    fn provider_name(&self) -> &'static str;
    /// Discover candidate feed endpoints for a site URL.
    fn probe(&self, request: &FeedProbeRequest) -> Result<FeedProbeResponse, FeedError>;
    /// Fetch and parse one feed document.
    fn fetch_feed(&self, request: &FeedFetchRequest) -> Result<FeedFetchResponse, FeedError>;
}
/// Default `FeedFetchBackend` that performs real HTTP requests through a
/// pluggable `WebFetchBackend`.
#[derive(Clone)]
pub struct HttpFeedProvider {
    // Arc so clones share one backend; trait object allows injecting test doubles.
    fetch_backend: Arc<dyn WebFetchBackend>,
}
impl std::fmt::Debug for HttpFeedProvider {
    // Manual impl because `dyn WebFetchBackend` is not `Debug`; the backend
    // field is elided as `..`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("HttpFeedProvider").finish_non_exhaustive()
    }
}
impl Default for HttpFeedProvider {
    /// Same as [`HttpFeedProvider::new`]: uses the real HTTP fetch backend.
    fn default() -> Self {
        Self::new()
    }
}
impl HttpFeedProvider {
    /// Construct a provider backed by the default HTTP fetch implementation.
    #[must_use]
    pub fn new() -> Self {
        let backend: Arc<dyn WebFetchBackend> = Arc::new(HttpFetchProvider::new());
        Self {
            fetch_backend: backend,
        }
    }

    /// Construct a provider that delegates all fetching to `fetch_backend`
    /// (used by tests to inject canned responses).
    #[must_use]
    pub fn with_fetch_backend(fetch_backend: Arc<dyn WebFetchBackend>) -> Self {
        Self { fetch_backend }
    }
}
impl FeedFetchBackend for HttpFeedProvider {
    fn provider_name(&self) -> &'static str {
        "http-feed"
    }

    /// Discover candidate feed endpoints for `request.url`.
    ///
    /// Combines three sources: the URL itself when it already looks like a
    /// feed, `<link rel="alternate">` tags scraped from the fetched page, and
    /// well-known feed paths. Candidates are de-duplicated, ordered by
    /// confidence, and truncated to `max_candidates`.
    fn probe(&self, request: &FeedProbeRequest) -> Result<FeedProbeResponse, FeedError> {
        let input_url =
            Url::parse(&request.url).map_err(|error| FeedError::InvalidUrl(error.to_string()))?;
        let mut candidates = Vec::new();
        if looks_like_feed_url(input_url.path()) {
            // The input itself looks like a feed: highest-confidence candidate.
            candidates.push(FeedEndpointCandidate {
                url: input_url.to_string(),
                format_hint: format_hint_from_url(input_url.path()),
                discovery_source: FeedDiscoverySource::DirectUrl,
                confidence_bps: 9_000,
            });
        }
        let fetch = WebFetchRequest::new(input_url.as_str())
            .with_max_bytes(request.max_bytes)
            .with_timeout_ms(request.timeout_ms);
        // A fetch failure is tolerated (the page may be unreachable and the
        // other discovery sources can still apply), but an error from
        // alternate-link discovery on a fetched page propagates via `?`.
        if let Ok(response) = self.fetch_backend.fetch(&fetch) {
            candidates.extend(discover_alternate_links(&response.body, &response.url)?);
        }
        if request.probe_common_paths {
            candidates.extend(common_feed_candidates(&input_url));
        }
        // Sort by confidence and drop duplicate URLs before applying the cap.
        dedup_candidates(&mut candidates);
        candidates.truncate(request.max_candidates);
        Ok(FeedProbeResponse {
            provider: self.provider_name().into(),
            input_url: request.url.clone(),
            candidates,
        })
    }

    /// Fetch one feed document and parse it into normalized items.
    fn fetch_feed(&self, request: &FeedFetchRequest) -> Result<FeedFetchResponse, FeedError> {
        let fetch_request = WebFetchRequest::from(request);
        let response = self.fetch_backend.fetch(&fetch_request)?;
        parse_feed_response(self.provider_name(), response)
    }
}
fn parse_feed_response(
provider_name: &str,
response: WebFetchResponse,
) -> Result<FeedFetchResponse, FeedError> {
let raw_hash = sha256(&response.body);
let parsed = parse_feed(&response.body)?;
Ok(FeedFetchResponse {
provider: provider_name.into(),
url: response.url,
status: response.status,
content_type: response.content_type,
format: parsed.format,
raw_hash,
raw_body: response.body,
truncated: response.truncated,
feed_title: parsed.feed_title,
feed_link: parsed.feed_link,
feed_updated_at: parsed.feed_updated_at,
items: parsed.items,
})
}
/// Intermediate parse result shared by the JSON and XML parsing paths.
#[derive(Debug, Default)]
struct ParsedFeed {
    // Defaults to `FeedFormat::Unknown` (manual `Default` impl on FeedFormat).
    format: FeedFormat,
    feed_title: Option<String>,
    feed_link: Option<String>,
    feed_updated_at: Option<String>,
    items: Vec<FeedItem>,
}
impl Default for FeedFormat {
    /// `Unknown` until a recognized root element or JSON document is seen.
    fn default() -> Self {
        Self::Unknown
    }
}
/// Parse `body` as a feed, trying JSON Feed first (cheap to reject for XML
/// bodies) and falling back to RSS/Atom XML parsing.
fn parse_feed(body: &str) -> Result<ParsedFeed, FeedError> {
    match parse_json_feed(body) {
        Ok(feed) => Ok(feed),
        Err(_) => parse_xml_feed(body),
    }
}
/// Minimal JSON Feed (1.x) document shape; unrecognized fields are ignored.
#[derive(Debug, Deserialize)]
struct JsonFeed {
    title: Option<String>,
    home_page_url: Option<String>,
    feed_url: Option<String>,
    // No `#[serde(default)]`: a JSON object without `items` is rejected here
    // and falls through to the XML parser.
    items: Vec<JsonFeedItem>,
}
/// Minimal JSON Feed item shape, covering both 1.0 (`author`) and 1.1
/// (`authors`) author conventions.
#[derive(Debug, Deserialize)]
struct JsonFeedItem {
    id: Option<String>,
    title: Option<String>,
    url: Option<String>,
    external_url: Option<String>,
    summary: Option<String>,
    content_text: Option<String>,
    date_published: Option<String>,
    date_modified: Option<String>,
    // JSON Feed 1.0 single author.
    author: Option<JsonFeedAuthor>,
    // JSON Feed 1.1 author list.
    authors: Option<Vec<JsonFeedAuthor>>,
    tags: Option<Vec<String>>,
}
/// JSON Feed author object; only the display name is used.
#[derive(Debug, Deserialize)]
struct JsonFeedAuthor {
    name: Option<String>,
}
/// Parse `body` as a JSON Feed document into the normalized representation.
///
/// Author names from the 1.1 `authors` list and the legacy 1.0 `author`
/// field are merged, then sorted and de-duplicated.
///
/// # Errors
/// Returns `FeedError::Parse` when `body` is not valid JSON Feed shaped JSON.
fn parse_json_feed(body: &str) -> Result<ParsedFeed, FeedError> {
    let json: JsonFeed =
        serde_json::from_str(body).map_err(|error| FeedError::Parse(error.to_string()))?;
    // Prefer the human-facing site link over the feed's own URL.
    let feed_link = json.home_page_url.or(json.feed_url);
    let mut items = Vec::with_capacity(json.items.len());
    for entry in json.items {
        let mut authors: Vec<String> = Vec::new();
        for author in entry.authors.unwrap_or_default() {
            if let Some(name) = author.name {
                authors.push(name);
            }
        }
        if let Some(name) = entry.author.and_then(|author| author.name) {
            authors.push(name);
        }
        authors.sort();
        authors.dedup();
        let summary = entry.summary.or(entry.content_text);
        let link = entry.url.or(entry.external_url);
        let hash = item_hash(&entry.id, &entry.title, &link, &summary);
        items.push(FeedItem {
            id: entry.id,
            title: entry.title,
            link,
            summary,
            published_at: entry.date_published,
            updated_at: entry.date_modified,
            authors,
            categories: entry.tags.unwrap_or_default(),
            item_hash: hash,
        });
    }
    Ok(ParsedFeed {
        format: FeedFormat::JsonFeed,
        feed_title: json.title,
        feed_link,
        feed_updated_at: None,
        items,
    })
}
/// Mutable accumulator for one `<item>`/`<entry>` while streaming XML events;
/// converted into a `FeedItem` when the closing tag is reached.
#[derive(Debug, Default)]
struct FeedItemBuilder {
    id: Option<String>,
    title: Option<String>,
    link: Option<String>,
    summary: Option<String>,
    published_at: Option<String>,
    updated_at: Option<String>,
    authors: Vec<String>,
    categories: Vec<String>,
}
impl FeedItemBuilder {
    /// Finalize the accumulated fields into a `FeedItem`, computing the
    /// content fingerprint from id/title/link/summary.
    fn build(self) -> FeedItem {
        let Self {
            id,
            title,
            link,
            summary,
            published_at,
            updated_at,
            authors,
            categories,
        } = self;
        let item_hash = item_hash(&id, &title, &link, &summary);
        FeedItem {
            id,
            title,
            link,
            summary,
            published_at,
            updated_at,
            authors,
            categories,
            item_hash,
        }
    }
}
/// Stream-parse an RSS/RDF or Atom document into a `ParsedFeed`.
///
/// # Errors
/// `FeedError::Parse` on malformed XML; `FeedError::UnsupportedFormat` when
/// the document parsed but no recognized root element (`rss`, `rdf`, `feed`)
/// was seen.
fn parse_xml_feed(body: &str) -> Result<ParsedFeed, FeedError> {
    let mut reader = Reader::from_str(body);
    // Trim whitespace in text nodes so pretty-printed feeds parse like compact ones.
    reader.config_mut().trim_text(true);
    let mut feed = ParsedFeed::default();
    // `Some` while inside an <item>/<entry>; fields accumulate until the end tag.
    let mut current_item: Option<FeedItemBuilder> = None;
    loop {
        match reader.read_event() {
            Ok(Event::Start(start)) => {
                let name = local_name(start.name().as_ref());
                match name.as_str() {
                    // Root elements decide the overall format.
                    "rss" | "rdf" => feed.format = FeedFormat::Rss,
                    "feed" => feed.format = FeedFormat::Atom,
                    "item" | "entry" => current_item = Some(FeedItemBuilder::default()),
                    // Must precede the feed-level arms below: while inside an
                    // item, title/link/updated belong to the item, not the feed.
                    _ if current_item.is_some() => {
                        read_item_start(&mut reader, &start, &name, current_item.as_mut());
                    }
                    "title" => feed.feed_title = read_text(&mut reader, &start)?,
                    "link" => {
                        // Atom carries the link in an href attribute; RSS uses text.
                        if feed.format == FeedFormat::Atom {
                            feed.feed_link = attr_value(&reader, &start, b"href");
                        } else {
                            feed.feed_link = read_text(&mut reader, &start)?;
                        }
                    }
                    "updated" | "lastbuilddate" => {
                        feed.feed_updated_at = read_text(&mut reader, &start)?
                    }
                    _ => {}
                }
            }
            Ok(Event::Empty(empty)) => {
                // Self-closing <link .../>: keep the first href seen (take().or_else).
                let name = local_name(empty.name().as_ref());
                if name == "link" {
                    if let Some(item) = current_item.as_mut() {
                        item.link = item
                            .link
                            .take()
                            .or_else(|| attr_value(&reader, &empty, b"href"));
                    } else if feed.format == FeedFormat::Atom {
                        feed.feed_link = feed
                            .feed_link
                            .take()
                            .or_else(|| attr_value(&reader, &empty, b"href"));
                    }
                }
            }
            Ok(Event::End(end)) => {
                // Closing </item> or </entry> finalizes the accumulated builder.
                let name = local_name(end.name().as_ref());
                if matches!(name.as_str(), "item" | "entry") {
                    if let Some(item) = current_item.take() {
                        feed.items.push(item.build());
                    }
                }
            }
            Ok(Event::Eof) => break,
            Err(error) => return Err(FeedError::Parse(error.to_string())),
            _ => {}
        }
    }
    if feed.format == FeedFormat::Unknown {
        return Err(FeedError::UnsupportedFormat);
    }
    Ok(feed)
}
/// Handle a `Start` event for an element nested inside an `<item>`/`<entry>`,
/// copying its text (or `href` attribute) into the builder.
///
/// Later values overwrite earlier ones for title/id/dates; the summary keeps
/// the first non-empty value; authors and categories accumulate.
fn read_item_start(
    reader: &mut Reader<&[u8]>,
    start: &BytesStart<'_>,
    name: &str,
    item: Option<&mut FeedItemBuilder>,
) {
    let Some(item) = item else {
        return;
    };
    match name {
        "title" => item.title = read_text(reader, start).ok().flatten(),
        "link" => {
            // Atom: prefer the href attribute; RSS: fall back to element text.
            // `or_else` is lazy, so the text is only consumed when href is absent.
            item.link = attr_value(reader, start, b"href")
                .or_else(|| read_text(reader, start).ok().flatten());
        }
        "guid" | "id" => item.id = read_text(reader, start).ok().flatten(),
        "description" | "summary" | "content" | "encoded" => {
            // First non-empty summary wins (e.g. <description> beats a later
            // <content:encoded>).
            if item.summary.is_none() {
                item.summary = read_text(reader, start).ok().flatten();
            }
        }
        "pubdate" | "published" => item.published_at = read_text(reader, start).ok().flatten(),
        "updated" => item.updated_at = read_text(reader, start).ok().flatten(),
        "creator" | "author" | "name" => {
            // NOTE(review): for Atom's <author><name>…</name></author> this
            // consumes the whole <author> element in one read_text call —
            // verify the extracted string is the bare name and not inner markup.
            if let Some(author) = read_text(reader, start).ok().flatten() {
                item.authors.push(author);
            }
        }
        "category" => {
            if let Some(category) = read_text(reader, start).ok().flatten() {
                item.categories.push(category);
            }
        }
        _ => {}
    }
}
fn read_text(
reader: &mut Reader<&[u8]>,
start: &BytesStart<'_>,
) -> Result<Option<String>, FeedError> {
reader
.read_text(start.name())
.map(|text| {
let trimmed = text.trim();
(!trimmed.is_empty()).then_some(trimmed.to_string())
})
.map_err(|error| FeedError::Parse(error.to_string()))
}
/// Lowercase an XML qualified name and strip any namespace prefix,
/// e.g. `b"atom:Link"` -> `"link"`, `b"TITLE"` -> `"title"`.
fn local_name(name: &[u8]) -> String {
    let raw = String::from_utf8_lossy(name).to_ascii_lowercase();
    // The previous `rsplit(':').next().unwrap_or(&raw)` fallback was dead code
    // (`rsplit` always yields at least one item); `rfind` also avoids a second
    // allocation when the name carries no prefix.
    match raw.rfind(':') {
        Some(colon) => raw[colon + 1..].to_string(),
        None => raw,
    }
}
/// Return the decoded value of the first attribute whose (namespace-stripped,
/// lowercased) name equals `key`, or `None` if absent or undecodable.
fn attr_value(reader: &Reader<&[u8]>, start: &BytesStart<'_>, key: &[u8]) -> Option<String> {
    for attribute in start.attributes().flatten() {
        if local_name(attribute.key.as_ref()).as_bytes() != key {
            continue;
        }
        // First matching attribute wins; a decode failure yields None,
        // mirroring the behavior for a missing attribute.
        return attribute
            .decode_and_unescape_value(reader.decoder())
            .ok()
            .map(|value| value.into_owned());
    }
    None
}
/// Extract `<link rel="alternate" type="…">` feed references from an HTML page.
///
/// Only links whose `type` attribute maps to a known feed format are kept;
/// relative hrefs are resolved against `base_url`. A single href that fails to
/// resolve is skipped so it cannot abort discovery of the remaining links.
///
/// # Errors
/// `FeedError::InvalidUrl` when `base_url` itself does not parse, or
/// `FeedError::Parse` if a regex fails to compile (static patterns, so this
/// should not occur in practice).
fn discover_alternate_links(
    body: &str,
    base_url: &str,
) -> Result<Vec<FeedEndpointCandidate>, FeedError> {
    let base = Url::parse(base_url).map_err(|error| FeedError::InvalidUrl(error.to_string()))?;
    // Lightweight HTML scan: grab every <link …> tag, then pull name="value"
    // attributes out of each tag separately.
    let link_re = regex_lite::Regex::new(r#"(?is)<link\s+[^>]*>"#)
        .map_err(|error| FeedError::Parse(error.to_string()))?;
    let attr_re = regex_lite::Regex::new(r#"(?is)([a-zA-Z_:.-]+)\s*=\s*["']([^"']+)["']"#)
        .map_err(|error| FeedError::Parse(error.to_string()))?;
    let mut candidates = Vec::new();
    for link_match in link_re.find_iter(body) {
        let tag = link_match.as_str();
        let mut rel = None;
        let mut content_type = None;
        let mut href = None;
        for capture in attr_re.captures_iter(tag) {
            let Some(name) = capture.get(1) else {
                continue;
            };
            let Some(value) = capture.get(2) else {
                continue;
            };
            match name.as_str().to_ascii_lowercase().as_str() {
                "rel" => rel = Some(value.as_str().to_ascii_lowercase()),
                "type" => content_type = Some(value.as_str().to_ascii_lowercase()),
                "href" => href = Some(value.as_str().to_string()),
                _ => {}
            }
        }
        // `contains` also accepts compound rels like "alternate stylesheet".
        if !rel.as_deref().unwrap_or_default().contains("alternate") {
            continue;
        }
        let format_hint = content_type
            .as_deref()
            .map(format_hint_from_content_type)
            .unwrap_or(FeedFormat::Unknown);
        if format_hint == FeedFormat::Unknown {
            continue;
        }
        let Some(href) = href else {
            continue;
        };
        // Fix: a single malformed href previously aborted the entire discovery
        // (and, via `?` in the caller, the whole probe). Skip just that link.
        let Ok(url) = base.join(&href) else {
            continue;
        };
        candidates.push(FeedEndpointCandidate {
            url: url.to_string(),
            format_hint,
            discovery_source: FeedDiscoverySource::AlternateLink,
            confidence_bps: 8_500,
        });
    }
    Ok(candidates)
}
/// Build low-confidence candidates for well-known feed paths on the site root.
/// Paths that fail to join onto `base_url` are silently dropped.
fn common_feed_candidates(base_url: &Url) -> Vec<FeedEndpointCandidate> {
    let common_paths = [
        ("/feed", FeedFormat::Rss),
        ("/feed/", FeedFormat::Rss),
        ("/rss", FeedFormat::Rss),
        ("/rss.xml", FeedFormat::Rss),
        ("/feed.xml", FeedFormat::Rss),
        ("/atom.xml", FeedFormat::Atom),
        ("/index.xml", FeedFormat::Rss),
        ("/feed.json", FeedFormat::JsonFeed),
    ];
    let mut candidates = Vec::with_capacity(common_paths.len());
    for (path, format_hint) in common_paths {
        if let Ok(url) = base_url.join(path) {
            candidates.push(FeedEndpointCandidate {
                url: url.to_string(),
                format_hint,
                discovery_source: FeedDiscoverySource::CommonPath,
                confidence_bps: 4_000,
            });
        }
    }
    candidates
}
/// Sort candidates by descending confidence (ties broken by URL for
/// deterministic output) and remove duplicate URLs, keeping the
/// highest-confidence entry for each URL.
fn dedup_candidates(candidates: &mut Vec<FeedEndpointCandidate>) {
    candidates.sort_by(|a, b| {
        b.confidence_bps
            .cmp(&a.confidence_bps)
            .then_with(|| a.url.cmp(&b.url))
    });
    // Fix: `Vec::dedup_by` only removes *adjacent* duplicates, but after
    // sorting by confidence the same URL at two different confidence levels
    // (e.g. a DirectUrl at 9_000 and a CommonPath at 4_000) is generally not
    // adjacent, so duplicates survived. Track seen URLs explicitly; `retain`
    // keeps the first (highest-confidence) occurrence.
    let mut seen = std::collections::HashSet::new();
    candidates.retain(|candidate| seen.insert(candidate.url.clone()));
}
/// Heuristic: does this URL path look like a feed endpoint?
/// Case-insensitive suffix check against common feed extensions and paths.
#[allow(clippy::case_sensitive_file_extension_comparisons)]
fn looks_like_feed_url(path: &str) -> bool {
    const FEED_SUFFIXES: [&str; 7] = [
        ".rss", ".xml", ".atom", ".json", "/feed", "/feed/", "/rss",
    ];
    let path = path.to_ascii_lowercase();
    FEED_SUFFIXES.iter().any(|suffix| path.ends_with(suffix))
}
/// Guess a feed format from the URL path alone.
/// Order matters: a `.json` suffix wins even if the path also mentions "atom".
#[allow(clippy::case_sensitive_file_extension_comparisons)]
fn format_hint_from_url(path: &str) -> FeedFormat {
    let path = path.to_ascii_lowercase();
    if path.ends_with(".json") {
        return FeedFormat::JsonFeed;
    }
    if path.contains("atom") {
        return FeedFormat::Atom;
    }
    if looks_like_feed_url(&path) {
        return FeedFormat::Rss;
    }
    FeedFormat::Unknown
}
/// Guess a feed format from a (lowercased) MIME type by substring matching.
/// Precedence: JSON Feed, then Atom, then anything RSS- or XML-flavored.
fn format_hint_from_content_type(content_type: &str) -> FeedFormat {
    let hints = [
        ("json", FeedFormat::JsonFeed),
        ("atom", FeedFormat::Atom),
        ("rss", FeedFormat::Rss),
        ("xml", FeedFormat::Rss),
    ];
    for (needle, format) in hints {
        if content_type.contains(needle) {
            return format;
        }
    }
    FeedFormat::Unknown
}
/// Compute a stable fingerprint for an item from its identifying fields.
/// Debug formatting distinguishes `None` from `Some("None")`, keeping the
/// fingerprint unambiguous; newlines separate the four components.
fn item_hash(
    id: &Option<String>,
    title: &Option<String>,
    link: &Option<String>,
    summary: &Option<String>,
) -> String {
    let fingerprint = format!("{id:?}\n{title:?}\n{link:?}\n{summary:?}");
    sha256(&fingerprint)
}
/// SHA-256 of `value`, hex-encoded with a self-describing "sha256:" prefix.
fn sha256(value: &str) -> String {
    let digest = Sha256::digest(value.as_bytes());
    format!("sha256:{}", hex::encode(digest))
}
// Unit tests: all network I/O is replaced by a canned fetch backend, so these
// exercise only the parsing and discovery logic.
#[cfg(test)]
mod tests {
use super::*;
// Test double that returns the same pre-built response for every request.
#[derive(Debug)]
struct StaticFetchBackend {
response: WebFetchResponse,
}
impl WebFetchBackend for StaticFetchBackend {
fn provider_name(&self) -> &'static str {
"static"
}
fn fetch(&self, _request: &WebFetchRequest) -> Result<WebFetchResponse, WebFetchError> {
Ok(self.response.clone())
}
}
// RSS: items are extracted and both the raw-body and per-item hashes carry
// the "sha256:" prefix.
#[test]
fn parses_rss_items_and_hashes_raw_body() {
let response = WebFetchResponse {
url: "https://example.test/feed.xml".into(),
status: 200,
content_type: Some("application/rss+xml".into()),
body: r#"
<rss version="2.0">
<channel>
<title>Local News</title>
<link>https://example.test</link>
<item>
<guid>abc</guid>
<title>Council update</title>
<link>https://example.test/a</link>
<description>Short summary.</description>
<pubDate>Thu, 30 Apr 2026 08:00:00 GMT</pubDate>
<category>Civic</category>
</item>
</channel>
</rss>
"#
.into(),
truncated: false,
};
let provider =
HttpFeedProvider::with_fetch_backend(Arc::new(StaticFetchBackend { response }));
let parsed = provider
.fetch_feed(&FeedFetchRequest::new("https://example.test/feed.xml"))
.unwrap();
assert_eq!(parsed.format, FeedFormat::Rss);
assert_eq!(parsed.feed_title.as_deref(), Some("Local News"));
assert_eq!(parsed.items.len(), 1);
assert_eq!(parsed.items[0].title.as_deref(), Some("Council update"));
assert_eq!(parsed.items[0].id.as_deref(), Some("abc"));
assert!(parsed.raw_hash.starts_with("sha256:"));
assert!(parsed.items[0].item_hash.starts_with("sha256:"));
}
// Atom: entry links come from the href attribute and <updated> is captured.
#[test]
fn parses_atom_entries() {
let response = WebFetchResponse {
url: "https://example.test/atom.xml".into(),
status: 200,
content_type: Some("application/atom+xml".into()),
body: r#"
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Atom News</title>
<link href="https://example.test"/>
<entry>
<id>tag:example.test,2026:1</id>
<title>Match report</title>
<link href="https://example.test/match"/>
<summary>Färjestad update.</summary>
<updated>2026-04-30T10:00:00Z</updated>
</entry>
</feed>
"#
.into(),
truncated: false,
};
let provider =
HttpFeedProvider::with_fetch_backend(Arc::new(StaticFetchBackend { response }));
let parsed = provider
.fetch_feed(&FeedFetchRequest::new("https://example.test/atom.xml"))
.unwrap();
assert_eq!(parsed.format, FeedFormat::Atom);
assert_eq!(
parsed.items[0].link.as_deref(),
Some("https://example.test/match")
);
assert_eq!(
parsed.items[0].updated_at.as_deref(),
Some("2026-04-30T10:00:00Z")
);
}
// JSON Feed: the JSON parsing path runs before the XML fallback and maps
// tags to categories.
#[test]
fn parses_json_feed_items() {
let response = WebFetchResponse {
url: "https://example.test/feed.json".into(),
status: 200,
content_type: Some("application/feed+json".into()),
body: r#"
{
"version": "https://jsonfeed.org/version/1.1",
"title": "JSON News",
"home_page_url": "https://example.test",
"items": [
{
"id": "1",
"url": "https://example.test/1",
"title": "Coffee company expands",
"summary": "Local business update.",
"date_published": "2026-04-30T09:00:00Z",
"tags": ["business"]
}
]
}
"#
.into(),
truncated: false,
};
let provider =
HttpFeedProvider::with_fetch_backend(Arc::new(StaticFetchBackend { response }));
let parsed = provider
.fetch_feed(&FeedFetchRequest::new("https://example.test/feed.json"))
.unwrap();
assert_eq!(parsed.format, FeedFormat::JsonFeed);
assert_eq!(parsed.feed_title.as_deref(), Some("JSON News"));
assert_eq!(parsed.items[0].categories, vec!["business"]);
}
// Probing an HTML page yields both <link rel="alternate"> candidates
// (relative hrefs resolved against the page URL) and common-path guesses.
#[test]
fn probe_discovers_alternate_feed_links_and_common_paths() {
let response = WebFetchResponse {
url: "https://example.test/".into(),
status: 200,
content_type: Some("text/html".into()),
body: r#"
<html>
<head>
<link rel="alternate" type="application/rss+xml" href="/rss.xml">
<link rel="alternate" type="application/atom+xml" href="https://example.test/atom.xml">
</head>
</html>
"#
.into(),
truncated: false,
};
let provider =
HttpFeedProvider::with_fetch_backend(Arc::new(StaticFetchBackend { response }));
let probe = provider
.probe(&FeedProbeRequest::new("https://example.test/"))
.unwrap();
assert!(probe.candidates.iter().any(|candidate| {
candidate.discovery_source == FeedDiscoverySource::AlternateLink
&& candidate.url == "https://example.test/rss.xml"
}));
assert!(probe.candidates.iter().any(|candidate| {
candidate.discovery_source == FeedDiscoverySource::CommonPath
&& candidate.url == "https://example.test/feed"
}));
}
}