Skip to main content

stygian_graph/adapters/
rss_feed.rs

//! RSS / Atom feed [`ScrapingService`](crate::ports::ScrapingService) adapter
//!
//! Parses RSS 1.0, RSS 2.0, Atom, and JSON Feed formats via the `feed-rs`
//! crate, returning feed items as structured JSON for downstream pipeline nodes.
//!
//! # Example
//!
//! ```no_run
//! use stygian_graph::adapters::rss_feed::RssFeedAdapter;
//! use stygian_graph::ports::{ScrapingService, ServiceInput};
//! use serde_json::json;
//!
//! # tokio::runtime::Runtime::new().unwrap().block_on(async {
//! let adapter = RssFeedAdapter::new(reqwest::Client::new());
//! let input = ServiceInput {
//!     url: "https://example.com/feed.xml".into(),
//!     params: json!({}),
//! };
//! let output = adapter.execute(input).await.unwrap();
//! println!("{}", output.data); // JSON array of feed items
//! # });
//! ```

24use crate::domain::error::{Result, ServiceError, StygianError};
25use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
26use async_trait::async_trait;
27use feed_rs::parser;
28use serde::{Deserialize, Serialize};
29use serde_json::json;

// ─── Domain types ─────────────────────────────────────────────────────────────

33/// A single feed item extracted from RSS/Atom.
34#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
35pub struct FeedItem {
36    /// Item title.
37    pub title: Option<String>,
38    /// Primary link URL.
39    pub link: Option<String>,
40    /// Published or updated timestamp (ISO 8601).
41    pub published: Option<String>,
42    /// Item summary / description.
43    pub summary: Option<String>,
44    /// Category labels.
45    pub categories: Vec<String>,
46    /// Author names.
47    pub authors: Vec<String>,
48    /// Unique identifier (guid / id).
49    pub id: String,
50}

// ─── Adapter ──────────────────────────────────────────────────────────────────

/// RSS / Atom feed source adapter.
///
/// Fetches and parses feeds using the `feed-rs` crate which handles
/// RSS 1.0, RSS 2.0, Atom, and JSON Feed formats transparently.
pub struct RssFeedAdapter {
    // HTTP client supplied by the caller (see `new`), so connection pooling
    // and TLS configuration can be shared across adapters.
    client: reqwest::Client,
}
62impl RssFeedAdapter {
63    /// Create a new RSS feed adapter.
64    pub fn new(client: reqwest::Client) -> Self {
65        Self { client }
66    }
67}
69#[async_trait]
70impl ScrapingService for RssFeedAdapter {
71    /// Fetch and parse a feed, returning items as JSON.
72    ///
73    /// # Params (optional)
74    ///
75    /// * `since` — ISO 8601 datetime string; exclude items published before this.
76    /// * `limit` — integer; maximum number of items to return.
77    /// * `categories` — array of strings; only include items matching any of these categories.
78    async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
79        let resp = self
80            .client
81            .get(&input.url)
82            .header(
83                reqwest::header::ACCEPT,
84                "application/rss+xml, application/atom+xml, application/xml, text/xml, */*",
85            )
86            .send()
87            .await
88            .map_err(|e| {
89                StygianError::Service(ServiceError::Unavailable(format!("feed fetch failed: {e}")))
90            })?;
91
92        if !resp.status().is_success() {
93            return Err(StygianError::Service(ServiceError::InvalidResponse(
94                format!("feed returned HTTP {}", resp.status()),
95            )));
96        }
97
98        let bytes = resp.bytes().await.map_err(|e| {
99            StygianError::Service(ServiceError::Unavailable(format!(
100                "feed body read failed: {e}"
101            )))
102        })?;
103
104        let feed = parser::parse(&bytes[..]).map_err(|e| {
105            StygianError::Service(ServiceError::InvalidResponse(format!(
106                "feed parse failed: {e}"
107            )))
108        })?;
109
110        // Convert feed entries into our domain type
111        let mut items: Vec<FeedItem> = feed
112            .entries
113            .iter()
114            .map(|entry| {
115                let title = entry.title.as_ref().map(|t| t.content.clone());
116                let link = entry.links.first().map(|l| l.href.clone());
117                let published = entry.published.or(entry.updated).map(|dt| dt.to_rfc3339());
118                let summary = entry.summary.as_ref().map(|s| s.content.clone());
119                let categories = entry.categories.iter().map(|c| c.term.clone()).collect();
120                let authors = entry.authors.iter().map(|a| a.name.clone()).collect();
121                let id = entry.id.clone();
122
123                FeedItem {
124                    title,
125                    link,
126                    published,
127                    summary,
128                    categories,
129                    authors,
130                    id,
131                }
132            })
133            .collect();
134
135        // Apply optional filters
136        if let Some(since) = input.params.get("since").and_then(|v| v.as_str()) {
137            items.retain(|item| {
138                item.published
139                    .as_deref()
140                    .is_some_and(|pub_date| pub_date >= since)
141            });
142        }
143
144        if let Some(cats) = input.params.get("categories").and_then(|v| v.as_array()) {
145            let filter_cats: Vec<&str> = cats.iter().filter_map(|c| c.as_str()).collect();
146            if !filter_cats.is_empty() {
147                items.retain(|item| {
148                    item.categories
149                        .iter()
150                        .any(|c| filter_cats.contains(&c.as_str()))
151                });
152            }
153        }
154
155        if let Some(limit) = input.params.get("limit").and_then(|v| v.as_u64()) {
156            items.truncate(limit as usize);
157        }
158
159        let count = items.len();
160        let data = serde_json::to_string(&items).map_err(|e| {
161            StygianError::Service(ServiceError::InvalidResponse(format!(
162                "feed serialization failed: {e}"
163            )))
164        })?;
165
166        // Feed-level metadata
167        let feed_title = feed.title.map(|t| t.content);
168        let feed_description = feed.description.map(|d| d.content);
169        let feed_updated = feed.updated.map(|dt| dt.to_rfc3339());
170
171        Ok(ServiceOutput {
172            data,
173            metadata: json!({
174                "source": "rss_feed",
175                "feed_title": feed_title,
176                "feed_description": feed_description,
177                "last_updated": feed_updated,
178                "item_count": count,
179                "source_url": input.url,
180            }),
181        })
182    }
183
184    fn name(&self) -> &'static str {
185        "rss_feed"
186    }
187}

// ─── Tests ────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    // RSS 2.0 fixture: three items spread over the "tech", "science", and
    // "news" categories, published on three distinct dates.
    const RSS_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>Example Blog</title>
    <link>https://example.com</link>
    <description>An example RSS feed</description>
    <item>
      <title>First Post</title>
      <link>https://example.com/post/1</link>
      <pubDate>Mon, 01 Mar 2026 00:00:00 +0000</pubDate>
      <description>Summary of first post</description>
      <category>tech</category>
      <guid>post-1</guid>
    </item>
    <item>
      <title>Second Post</title>
      <link>https://example.com/post/2</link>
      <pubDate>Sun, 15 Feb 2026 00:00:00 +0000</pubDate>
      <description>Summary of second post</description>
      <category>science</category>
      <guid>post-2</guid>
    </item>
    <item>
      <title>Third Post</title>
      <link>https://example.com/post/3</link>
      <pubDate>Sat, 01 Feb 2026 00:00:00 +0000</pubDate>
      <description>Summary of third post</description>
      <category>tech</category>
      <category>news</category>
      <guid>post-3</guid>
    </item>
  </channel>
</rss>"#;

    // Atom fixture: two entries, only the first carrying a category.
    const ATOM_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>Example Atom Feed</title>
  <link href="https://example.com"/>
  <updated>2026-03-01T00:00:00Z</updated>
  <entry>
    <title>Atom Entry One</title>
    <link href="https://example.com/atom/1"/>
    <id>urn:uuid:atom-1</id>
    <updated>2026-03-01T00:00:00Z</updated>
    <summary>First atom entry</summary>
    <author><name>Alice</name></author>
    <category term="rust"/>
  </entry>
  <entry>
    <title>Atom Entry Two</title>
    <link href="https://example.com/atom/2"/>
    <id>urn:uuid:atom-2</id>
    <updated>2026-02-15T00:00:00Z</updated>
    <summary>Second atom entry</summary>
    <author><name>Bob</name></author>
  </entry>
</feed>"#;

    /// Parse `xml` with feed-rs and map each entry into a [`FeedItem`],
    /// mirroring the conversion the adapter performs in `execute`.
    fn collect_items(xml: &str) -> Vec<FeedItem> {
        let parsed = parser::parse(xml.as_bytes()).expect("parse");
        parsed
            .entries
            .iter()
            .map(|e| FeedItem {
                title: e.title.as_ref().map(|t| t.content.clone()),
                link: e.links.first().map(|l| l.href.clone()),
                published: e.published.or(e.updated).map(|dt| dt.to_rfc3339()),
                summary: e.summary.as_ref().map(|s| s.content.clone()),
                categories: e.categories.iter().map(|c| c.term.clone()).collect(),
                authors: e.authors.iter().map(|a| a.name.clone()).collect(),
                id: e.id.clone(),
            })
            .collect()
    }

    #[test]
    fn parse_rss_with_3_items() {
        let parsed = collect_items(RSS_FEED);
        assert_eq!(parsed.len(), 3);
        let first = &parsed[0];
        assert_eq!(first.title.as_deref(), Some("First Post"));
        assert_eq!(first.link.as_deref(), Some("https://example.com/post/1"));
        assert!(first.published.is_some());
        assert_eq!(first.summary.as_deref(), Some("Summary of first post"));
    }

    #[test]
    fn parse_atom_with_authors() {
        let entries = collect_items(ATOM_FEED);
        assert_eq!(entries.len(), 2);
        let first = &entries[0];
        assert_eq!(first.title.as_deref(), Some("Atom Entry One"));
        assert_eq!(first.authors, vec!["Alice".to_string()]);
        assert_eq!(first.categories, vec!["rust".to_string()]);
        assert_eq!(first.link.as_deref(), Some("https://example.com/atom/1"));
    }

    #[test]
    fn filter_by_since_date() {
        // Keep only items published in March 2026 or later.
        let recent: Vec<FeedItem> = collect_items(RSS_FEED)
            .into_iter()
            .filter(|item| matches!(item.published.as_deref(), Some(d) if d >= "2026-03-01"))
            .collect();
        assert_eq!(recent.len(), 1);
        assert_eq!(recent[0].title.as_deref(), Some("First Post"));
    }

    #[test]
    fn filter_by_categories() {
        let wanted = ["tech"];
        let tech_items: Vec<FeedItem> = collect_items(RSS_FEED)
            .into_iter()
            .filter(|item| item.categories.iter().any(|c| wanted.contains(&c.as_str())))
            .collect();
        assert_eq!(tech_items.len(), 2);
        assert_eq!(tech_items[0].title.as_deref(), Some("First Post"));
        assert_eq!(tech_items[1].title.as_deref(), Some("Third Post"));
    }

    #[test]
    fn empty_feed_returns_empty_array() {
        // A channel with no <item> elements must yield zero FeedItems.
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>Empty Feed</title>
  </channel>
</rss>"#;
        assert!(collect_items(xml).is_empty());
    }

    #[test]
    fn malformed_feed_returns_error() {
        // Truncated, non-feed bytes must surface a parse error, not panic.
        assert!(parser::parse(&b"<not-a-feed><broken"[..]).is_err());
    }

    #[test]
    fn limit_truncates_items() {
        let mut capped = collect_items(RSS_FEED);
        assert_eq!(capped.len(), 3);
        capped.truncate(2);
        assert_eq!(capped.len(), 2);
    }

    #[test]
    fn rss_items_have_ids() {
        // Every RSS <guid> must survive the mapping as a non-empty id.
        let parsed = collect_items(RSS_FEED);
        for idx in 0..3 {
            assert!(!parsed[idx].id.is_empty(), "item {idx} should have an id");
        }
    }

    #[test]
    fn atom_feed_has_categories() {
        let entries = collect_items(ATOM_FEED);
        assert_eq!(entries[0].categories, vec!["rust"]);
        assert!(entries[1].categories.is_empty());
    }
}
367}