// cortex_runtime/acquisition/feed_parser.rs

use super::http_client::HttpClient;
/// One item discovered in an RSS or Atom feed.
#[derive(Debug, Clone)]
pub struct FeedEntry {
    /// Item URL as given by the feed (after `resolve_url` for discovered links).
    pub url: String,
    /// Item title, when the feed provided one.
    pub title: Option<String>,
    /// Raw publication timestamp string, left unparsed
    /// (RSS `pubDate`/`dc:date`, Atom `published`/`updated`).
    pub published: Option<String>,
}
18
19pub async fn discover_feeds(html: &str, domain: &str, client: &HttpClient) -> Vec<FeedEntry> {
25 let html_owned = html.to_string();
27 let domain_owned = domain.to_string();
28 let feed_urls =
29 tokio::task::spawn_blocking(move || discover_feed_urls_sync(&html_owned, &domain_owned))
30 .await
31 .unwrap_or_default();
32
33 let mut entries = Vec::new();
34
35 for feed_url in &feed_urls {
36 if let Ok(resp) = client.get(feed_url, 5000).await {
37 if resp.status == 200 {
38 let mut parsed = parse_feed(&resp.body);
39 entries.append(&mut parsed);
40 if entries.len() >= 500 {
41 break;
42 }
43 }
44 }
45 }
46
47 entries
48}
49
50fn discover_feed_urls_sync(html: &str, domain: &str) -> Vec<String> {
52 use scraper::{Html, Selector};
53
54 let document = Html::parse_document(html);
55 let mut urls = Vec::new();
56
57 if let Ok(sel) = Selector::parse(r#"link[type="application/rss+xml"]"#) {
59 for el in document.select(&sel) {
60 if let Some(href) = el.value().attr("href") {
61 let resolved = resolve_url(href, domain);
62 if !urls.contains(&resolved) {
63 urls.push(resolved);
64 }
65 }
66 }
67 }
68
69 if let Ok(sel) = Selector::parse(r#"link[type="application/atom+xml"]"#) {
71 for el in document.select(&sel) {
72 if let Some(href) = el.value().attr("href") {
73 let resolved = resolve_url(href, domain);
74 if !urls.contains(&resolved) {
75 urls.push(resolved);
76 }
77 }
78 }
79 }
80
81 let common_paths = ["/feed", "/rss", "/atom.xml", "/feed.xml", "/rss.xml"];
83 for path in &common_paths {
84 let url = format!("https://{domain}{path}");
85 if !urls.contains(&url) {
86 urls.push(url);
87 }
88 }
89
90 urls
91}
92
/// Resolves a possibly-relative `href` against `domain`, yielding an absolute
/// URL. Absolute `http(s)` hrefs pass through untouched; everything else is
/// assumed to live on `https://{domain}`.
fn resolve_url(href: &str, domain: &str) -> String {
    if href.starts_with("http://") || href.starts_with("https://") {
        href.to_string()
    } else if let Some(rest) = href.strip_prefix("//") {
        // Bug fix: a protocol-relative URL (`//host/path`) names its own host;
        // gluing it onto `domain` would produce `https://domain//host/path`.
        format!("https://{rest}")
    } else if href.starts_with('/') {
        format!("https://{domain}{href}")
    } else {
        format!("https://{domain}/{href}")
    }
}
102
103fn parse_feed(xml: &str) -> Vec<FeedEntry> {
105 let mut entries = Vec::new();
106
107 if xml.contains("<rss") || xml.contains("<channel>") {
109 entries = parse_rss(xml);
110 }
111
112 if entries.is_empty() && (xml.contains("<feed") || xml.contains("<entry>")) {
114 entries = parse_atom(xml);
115 }
116
117 entries
118}
119
120fn parse_rss(xml: &str) -> Vec<FeedEntry> {
121 let mut entries = Vec::new();
122 let mut in_item = false;
123 let mut current_url = String::new();
124 let mut current_title: Option<String> = None;
125 let mut current_date: Option<String> = None;
126 let mut current_tag = String::new();
127
128 let mut reader = quick_xml::Reader::from_str(xml);
129 reader.config_mut().trim_text(true);
130 let mut buf = Vec::new();
131
132 loop {
133 match reader.read_event_into(&mut buf) {
134 Ok(quick_xml::events::Event::Start(ref e)) => {
135 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
136 if name == "item" {
137 in_item = true;
138 current_url.clear();
139 current_title = None;
140 current_date = None;
141 }
142 current_tag = name;
143 }
144 Ok(quick_xml::events::Event::Text(ref e)) => {
145 if in_item {
146 let text = e.unescape().unwrap_or_default().to_string();
147 let trimmed = text.trim().to_string();
148 if !trimmed.is_empty() {
149 match current_tag.as_str() {
150 "link" => current_url = trimmed,
151 "title" => current_title = Some(trimmed),
152 "pubDate" | "dc:date" => current_date = Some(trimmed),
153 _ => {}
154 }
155 }
156 }
157 }
158 Ok(quick_xml::events::Event::End(ref e)) => {
159 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
160 if name == "item" && in_item {
161 if !current_url.is_empty() {
162 entries.push(FeedEntry {
163 url: current_url.clone(),
164 title: current_title.clone(),
165 published: current_date.clone(),
166 });
167 }
168 in_item = false;
169 }
170 }
171 Ok(quick_xml::events::Event::Eof) => break,
172 Err(_) => break,
173 _ => {}
174 }
175 buf.clear();
176 }
177
178 entries
179}
180
181fn parse_atom(xml: &str) -> Vec<FeedEntry> {
182 let mut entries = Vec::new();
183 let mut in_entry = false;
184 let mut current_url = String::new();
185 let mut current_title: Option<String> = None;
186 let mut current_date: Option<String> = None;
187 let mut current_tag = String::new();
188
189 let mut reader = quick_xml::Reader::from_str(xml);
190 reader.config_mut().trim_text(true);
191 let mut buf = Vec::new();
192
193 loop {
194 match reader.read_event_into(&mut buf) {
195 Ok(quick_xml::events::Event::Start(ref e))
196 | Ok(quick_xml::events::Event::Empty(ref e)) => {
197 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
198 if name == "entry" {
199 in_entry = true;
200 current_url.clear();
201 current_title = None;
202 current_date = None;
203 }
204 if in_entry && name == "link" {
205 for attr in e.attributes().flatten() {
206 if attr.key.as_ref() == b"href" {
207 current_url = String::from_utf8_lossy(&attr.value).to_string();
208 }
209 }
210 }
211 current_tag = name;
212 }
213 Ok(quick_xml::events::Event::Text(ref e)) => {
214 if in_entry {
215 let text = e.unescape().unwrap_or_default().to_string();
216 let trimmed = text.trim().to_string();
217 if !trimmed.is_empty() {
218 match current_tag.as_str() {
219 "title" => current_title = Some(trimmed),
220 "published" | "updated" => current_date = Some(trimmed),
221 _ => {}
222 }
223 }
224 }
225 }
226 Ok(quick_xml::events::Event::End(ref e)) => {
227 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
228 if name == "entry" && in_entry {
229 if !current_url.is_empty() {
230 entries.push(FeedEntry {
231 url: current_url.clone(),
232 title: current_title.clone(),
233 published: current_date.clone(),
234 });
235 }
236 in_entry = false;
237 }
238 }
239 Ok(quick_xml::events::Event::Eof) => break,
240 Err(_) => break,
241 _ => {}
242 }
243 buf.clear();
244 }
245
246 entries
247}
248
#[cfg(test)]
mod tests {
    use super::*;

    /// `<link>` advertisements of both feed flavors are picked up, and a
    /// relative href is resolved against the domain.
    #[test]
    fn test_discover_feed_urls_sync() {
        let page = r#"
        <html><head>
        <link rel="alternate" type="application/rss+xml" href="/feed.xml" title="RSS" />
        <link rel="alternate" type="application/atom+xml" href="https://example.com/atom" />
        </head><body></body></html>
        "#;

        let found = discover_feed_urls_sync(page, "example.com");
        assert!(found.iter().any(|u| u.contains("feed.xml")));
        assert!(found.iter().any(|u| u.contains("atom")));
    }

    /// An item missing `pubDate` still parses; fields map through correctly.
    #[test]
    fn test_parse_rss() {
        let rss = r#"<?xml version="1.0"?>
        <rss version="2.0">
        <channel>
        <title>Test</title>
        <item>
        <title>Post 1</title>
        <link>https://example.com/post-1</link>
        <pubDate>Mon, 01 Jan 2026 00:00:00 GMT</pubDate>
        </item>
        <item>
        <title>Post 2</title>
        <link>https://example.com/post-2</link>
        </item>
        </channel>
        </rss>"#;

        let parsed = parse_rss(rss);
        assert_eq!(parsed.len(), 2);
        let first = &parsed[0];
        assert_eq!(first.url, "https://example.com/post-1");
        assert_eq!(first.title.as_deref(), Some("Post 1"));
        assert!(first.published.is_some());
    }

    /// The Atom link href (no rel attribute) becomes the entry URL.
    #[test]
    fn test_parse_atom() {
        let atom = r#"<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
        <title>Test</title>
        <entry>
        <title>Entry 1</title>
        <link href="https://example.com/entry-1" />
        <published>2026-01-15T00:00:00Z</published>
        </entry>
        </feed>"#;

        let parsed = parse_atom(atom);
        assert_eq!(parsed.len(), 1);
        let only = &parsed[0];
        assert_eq!(only.url, "https://example.com/entry-1");
        assert_eq!(only.title.as_deref(), Some("Entry 1"));
    }

    /// Relative hrefs gain the domain; absolute hrefs pass through.
    #[test]
    fn test_resolve_url() {
        let resolved = resolve_url("/feed.xml", "example.com");
        assert_eq!(resolved, "https://example.com/feed.xml");

        let passthrough = resolve_url("https://example.com/rss", "example.com");
        assert_eq!(passthrough, "https://example.com/rss");
    }
}