stygian_graph/adapters/rss_feed.rs

use crate::domain::error::{Result, ServiceError, StygianError};
use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
use async_trait::async_trait;
use feed_rs::parser;
use serde::{Deserialize, Serialize};
use serde_json::json;

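/// One feed entry, normalized across RSS 2.0 and Atom. Optional fields stay
/// `None` when the source feed omits them, so callers can distinguish
/// "absent" from "empty".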
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FeedItem {
    pub title: Option<String>,
    pub link: Option<String>,
    pub published: Option<String>,
    pub summary: Option<String>,
    pub categories: Vec<String>,
    pub authors: Vec<String>,
    pub id: String,
}

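/// Fetches an RSS or Atom feed over HTTP and normalizes it into `FeedItem`s.
/// Takes a shared `reqwest::Client` so the caller's connection pool is reused.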
pub struct RssFeedAdapter {
    client: reqwest::Client,
}

impl RssFeedAdapter {
    pub fn new(client: reqwest::Client) -> Self {
        Self { client }
    }
}

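// `execute` recognizes three optional JSON params on the input:
//   "since"      -- RFC 3339 date or datetime string; keep items published at or after it
//   "categories" -- array of strings; keep items matching any of them
//   "limit"      -- unsigned integer; truncate the result list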
#[async_trait]
impl ScrapingService for RssFeedAdapter {
    async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
        let resp = self
            .client
            .get(&input.url)
            .header(
                reqwest::header::ACCEPT,
                "application/rss+xml, application/atom+xml, application/xml, text/xml, */*",
            )
            .send()
            .await
            .map_err(|e| {
                StygianError::Service(ServiceError::Unavailable(format!("feed fetch failed: {e}")))
            })?;

        if !resp.status().is_success() {
            return Err(StygianError::Service(ServiceError::InvalidResponse(
                format!("feed returned HTTP {}", resp.status()),
            )));
        }

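        // Buffer the whole body before parsing; `parser::parse` accepts any
        // `std::io::Read`, which a byte slice satisfies.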
        let bytes = resp.bytes().await.map_err(|e| {
            StygianError::Service(ServiceError::Unavailable(format!(
                "feed body read failed: {e}"
            )))
        })?;

        let feed = parser::parse(&bytes[..]).map_err(|e| {
            StygianError::Service(ServiceError::InvalidResponse(format!(
                "feed parse failed: {e}"
            )))
        })?;

        let mut items: Vec<FeedItem> = feed
            .entries
            .iter()
            .map(|entry| {
                let title = entry.title.as_ref().map(|t| t.content.clone());
                let link = entry.links.first().map(|l| l.href.clone());
                let published = entry.published.or(entry.updated).map(|dt| dt.to_rfc3339());
                let summary = entry.summary.as_ref().map(|s| s.content.clone());
                let categories = entry.categories.iter().map(|c| c.term.clone()).collect();
                let authors = entry.authors.iter().map(|a| a.name.clone()).collect();
                let id = entry.id.clone();

                FeedItem {
                    title,
                    link,
                    published,
                    summary,
                    categories,
                    authors,
                    id,
                }
            })
            .collect();

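        // The "since" filter compares RFC 3339 strings lexicographically. That
        // is sound here because every `published` value was rendered by
        // `to_rfc3339` on a UTC datetime, so all strings share one fixed
        // layout and offset; mixed offsets would not order correctly as text.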
        if let Some(since) = input.params.get("since").and_then(|v| v.as_str()) {
            items.retain(|item| {
                item.published
                    .as_deref()
                    .is_some_and(|pub_date| pub_date >= since)
            });
        }

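        // Category filtering uses OR semantics: an item survives if any of its
        // categories exactly matches any requested category (case-sensitive).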
        if let Some(cats) = input.params.get("categories").and_then(|v| v.as_array()) {
            let filter_cats: Vec<&str> = cats.iter().filter_map(|c| c.as_str()).collect();
            if !filter_cats.is_empty() {
                items.retain(|item| {
                    item.categories
                        .iter()
                        .any(|c| filter_cats.contains(&c.as_str()))
                });
            }
        }

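        // "limit" runs last, so it caps the filtered result rather than the
        // raw feed.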
        if let Some(limit) = input.params.get("limit").and_then(|v| v.as_u64()) {
            items.truncate(limit as usize);
        }

        let count = items.len();
        let data = serde_json::to_string(&items).map_err(|e| {
            StygianError::Service(ServiceError::InvalidResponse(format!(
                "feed serialization failed: {e}"
            )))
        })?;

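        // Feed-level metadata for the output envelope. Moving `title` and
        // `description` out of `feed` is fine here: the borrow taken for the
        // entry mapping ended once `items` was collected.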
        let feed_title = feed.title.map(|t| t.content);
        let feed_description = feed.description.map(|d| d.content);
        let feed_updated = feed.updated.map(|dt| dt.to_rfc3339());

        Ok(ServiceOutput {
            data,
            metadata: json!({
                "source": "rss_feed",
                "feed_title": feed_title,
                "feed_description": feed_description,
                "last_updated": feed_updated,
                "item_count": count,
                "source_url": input.url,
            }),
        })
    }

    fn name(&self) -> &'static str {
        "rss_feed"
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    const RSS_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>Example Blog</title>
    <link>https://example.com</link>
    <description>An example RSS feed</description>
    <item>
      <title>First Post</title>
      <link>https://example.com/post/1</link>
      <pubDate>Sun, 01 Mar 2026 00:00:00 +0000</pubDate>
      <description>Summary of first post</description>
      <category>tech</category>
      <guid>post-1</guid>
    </item>
    <item>
      <title>Second Post</title>
      <link>https://example.com/post/2</link>
      <pubDate>Sun, 15 Feb 2026 00:00:00 +0000</pubDate>
      <description>Summary of second post</description>
      <category>science</category>
      <guid>post-2</guid>
    </item>
    <item>
      <title>Third Post</title>
      <link>https://example.com/post/3</link>
      <pubDate>Sun, 01 Feb 2026 00:00:00 +0000</pubDate>
      <description>Summary of third post</description>
      <category>tech</category>
      <category>news</category>
      <guid>post-3</guid>
    </item>
  </channel>
</rss>"#;

    const ATOM_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>Example Atom Feed</title>
  <link href="https://example.com"/>
  <updated>2026-03-01T00:00:00Z</updated>
  <entry>
    <title>Atom Entry One</title>
    <link href="https://example.com/atom/1"/>
    <id>urn:uuid:atom-1</id>
    <updated>2026-03-01T00:00:00Z</updated>
    <summary>First atom entry</summary>
    <author><name>Alice</name></author>
    <category term="rust"/>
  </entry>
  <entry>
    <title>Atom Entry Two</title>
    <link href="https://example.com/atom/2"/>
    <id>urn:uuid:atom-2</id>
    <updated>2026-02-15T00:00:00Z</updated>
    <summary>Second atom entry</summary>
    <author><name>Bob</name></author>
  </entry>
</feed>"#;

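    /// Maps parsed entries to `FeedItem` exactly as `execute` does, so the
    /// filter tests below exercise the same field mapping without any
    /// network I/O.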
    fn parse_test_feed(xml: &str) -> Vec<FeedItem> {
        let feed = parser::parse(xml.as_bytes()).expect("parse");
        feed.entries
            .iter()
            .map(|entry| {
                let title = entry.title.as_ref().map(|t| t.content.clone());
                let link = entry.links.first().map(|l| l.href.clone());
                let published = entry.published.or(entry.updated).map(|dt| dt.to_rfc3339());
                let summary = entry.summary.as_ref().map(|s| s.content.clone());
                let categories = entry.categories.iter().map(|c| c.term.clone()).collect();
                let authors = entry.authors.iter().map(|a| a.name.clone()).collect();
                let id = entry.id.clone();

                FeedItem {
                    title,
                    link,
                    published,
                    summary,
                    categories,
                    authors,
                    id,
                }
            })
            .collect()
    }

    #[test]
    fn parse_rss_with_3_items() {
        let items = parse_test_feed(RSS_FEED);
        assert_eq!(items.len(), 3);
        assert_eq!(items[0].title.as_deref(), Some("First Post"));
        assert_eq!(items[0].link.as_deref(), Some("https://example.com/post/1"));
        assert!(items[0].published.is_some());
        assert_eq!(items[0].summary.as_deref(), Some("Summary of first post"));
    }

    #[test]
    fn parse_atom_with_authors() {
        let items = parse_test_feed(ATOM_FEED);
        assert_eq!(items.len(), 2);
        assert_eq!(items[0].title.as_deref(), Some("Atom Entry One"));
        assert_eq!(items[0].authors, vec!["Alice".to_string()]);
        assert_eq!(items[0].categories, vec!["rust".to_string()]);
        assert_eq!(items[0].link.as_deref(), Some("https://example.com/atom/1"));
    }

    #[test]
    fn filter_by_since_date() {
        let mut items = parse_test_feed(RSS_FEED);
        items.retain(|item| {
            item.published
                .as_deref()
                .is_some_and(|pub_date| pub_date >= "2026-03-01")
        });
        assert_eq!(items.len(), 1);
        assert_eq!(items[0].title.as_deref(), Some("First Post"));
    }

    #[test]
    fn filter_by_categories() {
        let mut items = parse_test_feed(RSS_FEED);
        let filter_cats = vec!["tech"];
        items.retain(|item| {
            item.categories
                .iter()
                .any(|c| filter_cats.contains(&c.as_str()))
        });
        assert_eq!(items.len(), 2);
        assert_eq!(items[0].title.as_deref(), Some("First Post"));
        assert_eq!(items[1].title.as_deref(), Some("Third Post"));
    }

    #[test]
    fn empty_feed_returns_empty_array() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>Empty Feed</title>
  </channel>
</rss>"#;
        let items = parse_test_feed(xml);
        assert!(items.is_empty());
    }

    #[test]
    fn malformed_feed_returns_error() {
        let bad = b"<not-a-feed><broken";
        let result = parser::parse(&bad[..]);
        assert!(result.is_err());
    }

    #[test]
    fn limit_truncates_items() {
        let mut items = parse_test_feed(RSS_FEED);
        assert_eq!(items.len(), 3);
        items.truncate(2);
        assert_eq!(items.len(), 2);
    }
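
    // Added check: run the since, categories, and limit steps together, in
    // the same order `execute` applies them, to confirm the steps compose.
    #[test]
    fn filters_compose_in_execute_order() {
        let mut items = parse_test_feed(RSS_FEED);
        // since: all three items are published on or after 2026-02-01
        items.retain(|item| {
            item.published
                .as_deref()
                .is_some_and(|pub_date| pub_date >= "2026-02-01")
        });
        assert_eq!(items.len(), 3);
        // categories: only the first and third posts are tagged "tech"
        let filter_cats = vec!["tech"];
        items.retain(|item| {
            item.categories
                .iter()
                .any(|c| filter_cats.contains(&c.as_str()))
        });
        assert_eq!(items.len(), 2);
        // limit: cap the filtered result at one item
        items.truncate(1);
        assert_eq!(items[0].title.as_deref(), Some("First Post"));
    }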

    #[test]
    fn rss_items_have_ids() {
        let items = parse_test_feed(RSS_FEED);
        assert!(!items[0].id.is_empty());
        assert!(!items[1].id.is_empty());
        assert!(!items[2].id.is_empty());
    }

    #[test]
    fn atom_feed_has_categories() {
        let items = parse_test_feed(ATOM_FEED);
        assert_eq!(items[0].categories, vec!["rust"]);
        assert!(items[1].categories.is_empty());
    }
}