Skip to main content

cortex_runtime/acquisition/
feed_parser.rs

1//! RSS/Atom feed discovery and parsing.
2//!
3//! Finds RSS and Atom links in homepage HTML, tries common feed paths,
4//! and parses discovered feeds to extract URLs.
5
6use super::http_client::HttpClient;
7
/// A single item extracted from an RSS or Atom feed.
///
/// Values are stored exactly as they were read from the feed; in particular
/// `published` is the raw date string and is not parsed or normalised.
/// Equality compares all three fields, which makes entries easy to assert on
/// and to de-duplicate.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FeedEntry {
    /// The URL the entry links to.
    pub url: String,
    /// Title of the entry, when the feed provides one.
    pub title: Option<String>,
    /// Publication date as written in the feed, when the feed provides one.
    pub published: Option<String>,
}
18
19/// Discover and parse RSS/Atom feeds for a domain.
20///
21/// 1. Finds `<link rel="alternate" type="application/rss+xml">` in HTML.
22/// 2. Tries common feed paths: /feed, /rss, /atom.xml, /feed.xml, /rss.xml.
23/// 3. Parses discovered feeds and returns entries.
24pub async fn discover_feeds(html: &str, domain: &str, client: &HttpClient) -> Vec<FeedEntry> {
25    // Extract feed URLs in a blocking task (uses scraper which is not Send)
26    let html_owned = html.to_string();
27    let domain_owned = domain.to_string();
28    let feed_urls =
29        tokio::task::spawn_blocking(move || discover_feed_urls_sync(&html_owned, &domain_owned))
30            .await
31            .unwrap_or_default();
32
33    let mut entries = Vec::new();
34
35    for feed_url in &feed_urls {
36        if let Ok(resp) = client.get(feed_url, 5000).await {
37            if resp.status == 200 {
38                let mut parsed = parse_feed(&resp.body);
39                entries.append(&mut parsed);
40                if entries.len() >= 500 {
41                    break;
42                }
43            }
44        }
45    }
46
47    entries
48}
49
50/// Find feed URLs from `<link>` tags in HTML (sync, uses scraper).
51fn discover_feed_urls_sync(html: &str, domain: &str) -> Vec<String> {
52    use scraper::{Html, Selector};
53
54    let document = Html::parse_document(html);
55    let mut urls = Vec::new();
56
57    // RSS feeds
58    if let Ok(sel) = Selector::parse(r#"link[type="application/rss+xml"]"#) {
59        for el in document.select(&sel) {
60            if let Some(href) = el.value().attr("href") {
61                let resolved = resolve_url(href, domain);
62                if !urls.contains(&resolved) {
63                    urls.push(resolved);
64                }
65            }
66        }
67    }
68
69    // Atom feeds
70    if let Ok(sel) = Selector::parse(r#"link[type="application/atom+xml"]"#) {
71        for el in document.select(&sel) {
72            if let Some(href) = el.value().attr("href") {
73                let resolved = resolve_url(href, domain);
74                if !urls.contains(&resolved) {
75                    urls.push(resolved);
76                }
77            }
78        }
79    }
80
81    // Add common paths
82    let common_paths = ["/feed", "/rss", "/atom.xml", "/feed.xml", "/rss.xml"];
83    for path in &common_paths {
84        let url = format!("https://{domain}{path}");
85        if !urls.contains(&url) {
86            urls.push(url);
87        }
88    }
89
90    urls
91}
92
/// Turn a feed `href` taken from HTML into an absolute URL on `domain`.
///
/// * Absolute `http://` / `https://` URLs are returned as they are.
/// * Protocol-relative URLs (`//host/path`) receive an `https:` scheme and
///   keep their own host instead of being appended to `domain`.
/// * Root-relative paths (`/path`) become `https://{domain}/path`.
/// * Any other value is treated as a path directly under the site root
///   (`https://{domain}/{href}`); it is not resolved against the page's own path.
///
/// Whitespace surrounding `href` is ignored.
fn resolve_url(href: &str, domain: &str) -> String {
    let href = href.trim();
    if href.starts_with("http://") || href.starts_with("https://") {
        href.to_string()
    } else if href.starts_with("//") {
        // Must be tested before the single-slash case, which it would otherwise match.
        format!("https:{href}")
    } else if href.starts_with('/') {
        format!("https://{domain}{href}")
    } else {
        format!("https://{domain}/{href}")
    }
}
102
103/// Parse RSS 2.0 or Atom feed XML into entries.
104fn parse_feed(xml: &str) -> Vec<FeedEntry> {
105    let mut entries = Vec::new();
106
107    // Try RSS 2.0 first
108    if xml.contains("<rss") || xml.contains("<channel>") {
109        entries = parse_rss(xml);
110    }
111
112    // Try Atom
113    if entries.is_empty() && (xml.contains("<feed") || xml.contains("<entry>")) {
114        entries = parse_atom(xml);
115    }
116
117    entries
118}
119
120fn parse_rss(xml: &str) -> Vec<FeedEntry> {
121    let mut entries = Vec::new();
122    let mut in_item = false;
123    let mut current_url = String::new();
124    let mut current_title: Option<String> = None;
125    let mut current_date: Option<String> = None;
126    let mut current_tag = String::new();
127
128    let mut reader = quick_xml::Reader::from_str(xml);
129    reader.config_mut().trim_text(true);
130    let mut buf = Vec::new();
131
132    loop {
133        match reader.read_event_into(&mut buf) {
134            Ok(quick_xml::events::Event::Start(ref e)) => {
135                let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
136                if name == "item" {
137                    in_item = true;
138                    current_url.clear();
139                    current_title = None;
140                    current_date = None;
141                }
142                current_tag = name;
143            }
144            Ok(quick_xml::events::Event::Text(ref e)) => {
145                if in_item {
146                    let text = e.unescape().unwrap_or_default().to_string();
147                    let trimmed = text.trim().to_string();
148                    if !trimmed.is_empty() {
149                        match current_tag.as_str() {
150                            "link" => current_url = trimmed,
151                            "title" => current_title = Some(trimmed),
152                            "pubDate" | "dc:date" => current_date = Some(trimmed),
153                            _ => {}
154                        }
155                    }
156                }
157            }
158            Ok(quick_xml::events::Event::End(ref e)) => {
159                let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
160                if name == "item" && in_item {
161                    if !current_url.is_empty() {
162                        entries.push(FeedEntry {
163                            url: current_url.clone(),
164                            title: current_title.clone(),
165                            published: current_date.clone(),
166                        });
167                    }
168                    in_item = false;
169                }
170            }
171            Ok(quick_xml::events::Event::Eof) => break,
172            Err(_) => break,
173            _ => {}
174        }
175        buf.clear();
176    }
177
178    entries
179}
180
181fn parse_atom(xml: &str) -> Vec<FeedEntry> {
182    let mut entries = Vec::new();
183    let mut in_entry = false;
184    let mut current_url = String::new();
185    let mut current_title: Option<String> = None;
186    let mut current_date: Option<String> = None;
187    let mut current_tag = String::new();
188
189    let mut reader = quick_xml::Reader::from_str(xml);
190    reader.config_mut().trim_text(true);
191    let mut buf = Vec::new();
192
193    loop {
194        match reader.read_event_into(&mut buf) {
195            Ok(quick_xml::events::Event::Start(ref e))
196            | Ok(quick_xml::events::Event::Empty(ref e)) => {
197                let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
198                if name == "entry" {
199                    in_entry = true;
200                    current_url.clear();
201                    current_title = None;
202                    current_date = None;
203                }
204                if in_entry && name == "link" {
205                    for attr in e.attributes().flatten() {
206                        if attr.key.as_ref() == b"href" {
207                            current_url = String::from_utf8_lossy(&attr.value).to_string();
208                        }
209                    }
210                }
211                current_tag = name;
212            }
213            Ok(quick_xml::events::Event::Text(ref e)) => {
214                if in_entry {
215                    let text = e.unescape().unwrap_or_default().to_string();
216                    let trimmed = text.trim().to_string();
217                    if !trimmed.is_empty() {
218                        match current_tag.as_str() {
219                            "title" => current_title = Some(trimmed),
220                            "published" | "updated" => current_date = Some(trimmed),
221                            _ => {}
222                        }
223                    }
224                }
225            }
226            Ok(quick_xml::events::Event::End(ref e)) => {
227                let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
228                if name == "entry" && in_entry {
229                    if !current_url.is_empty() {
230                        entries.push(FeedEntry {
231                            url: current_url.clone(),
232                            title: current_title.clone(),
233                            published: current_date.clone(),
234                        });
235                    }
236                    in_entry = false;
237                }
238            }
239            Ok(quick_xml::events::Event::Eof) => break,
240            Err(_) => break,
241            _ => {}
242        }
243        buf.clear();
244    }
245
246    entries
247}
248
#[cfg(test)]
mod tests {
    use super::*;

    /// Feeds advertised through `<link>` tags are found for both RSS and Atom.
    #[test]
    fn test_discover_feed_urls_sync() {
        let page = r#"
        <html><head>
        <link rel="alternate" type="application/rss+xml" href="/feed.xml" title="RSS" />
        <link rel="alternate" type="application/atom+xml" href="https://example.com/atom" />
        </head><body></body></html>
        "#;

        let found = discover_feed_urls_sync(page, "example.com");
        let mentions = |needle: &str| found.iter().any(|url| url.contains(needle));

        assert!(mentions("feed.xml"));
        assert!(mentions("atom"));
    }

    /// Both items of a small RSS 2.0 document are returned, with link, title and date.
    #[test]
    fn test_parse_rss() {
        let document = r#"<?xml version="1.0"?>
        <rss version="2.0">
        <channel>
        <title>Test</title>
        <item>
            <title>Post 1</title>
            <link>https://example.com/post-1</link>
            <pubDate>Mon, 01 Jan 2026 00:00:00 GMT</pubDate>
        </item>
        <item>
            <title>Post 2</title>
            <link>https://example.com/post-2</link>
        </item>
        </channel>
        </rss>"#;

        let parsed = parse_rss(document);
        assert_eq!(parsed.len(), 2);

        let first = &parsed[0];
        assert_eq!(first.url, "https://example.com/post-1");
        assert_eq!(first.title.as_deref(), Some("Post 1"));
        assert!(first.published.is_some());
    }

    /// A single Atom entry yields its link href and title.
    #[test]
    fn test_parse_atom() {
        let document = r#"<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
        <title>Test</title>
        <entry>
            <title>Entry 1</title>
            <link href="https://example.com/entry-1" />
            <published>2026-01-15T00:00:00Z</published>
        </entry>
        </feed>"#;

        let parsed = parse_atom(document);
        assert_eq!(parsed.len(), 1);

        let only = &parsed[0];
        assert_eq!(only.url, "https://example.com/entry-1");
        assert_eq!(only.title.as_deref(), Some("Entry 1"));
    }

    /// Root-relative hrefs are placed on the domain; absolute URLs pass through.
    #[test]
    fn test_resolve_url() {
        let cases = [
            ("/feed.xml", "https://example.com/feed.xml"),
            ("https://example.com/rss", "https://example.com/rss"),
        ];

        for &(href, expected) in cases.iter() {
            assert_eq!(resolve_url(href, "example.com"), expected);
        }
    }
}