Skip to main content

rss_fetch/
lib.rs

1use std::collections::HashMap;
2use std::sync::Arc;
3
4use tokio::sync::Mutex;
5
/// Maximum feed response size in bytes (10 MB); enforced against the
/// Content-Length header before downloading the body.
const MAX_FEED_SIZE: u64 = 10 * 1024 * 1024;

/// Maximum article response size in bytes (5 MB); enforced against the
/// Content-Length header before downloading the body.
const MAX_ARTICLE_SIZE: u64 = 5 * 1024 * 1024;

/// Default maximum number of entries returned by "list" action.
pub const DEFAULT_LIST_LIMIT: usize = 20;
14
/// A single RSS/Atom feed entry with an assigned numeric ID.
#[derive(Debug, Clone)]
pub struct FeedEntry {
    /// Numeric ID assigned by `EntryStore` (starts at 1 per listing).
    pub id: u32,
    /// Entry title; "(no title)" when the feed omits one.
    pub title: String,
    /// Link to the full article (first link in the entry; may be empty).
    pub url: String,
    /// Published (or updated) date formatted as YYYY-MM-DD, or "-" if absent.
    pub published: String,
    /// HTML-stripped summary truncated to 80 characters; may be empty.
    pub summary: String,
}
24
/// Shared store that maps numeric IDs to article URLs.
/// Persists for the lifetime of the MCP server process.
///
/// Cheap to clone: all clones share one map through the `Arc`.
#[derive(Debug, Clone, Default)]
pub struct EntryStore {
    // tokio's async Mutex, since the store is accessed from async handlers.
    inner: Arc<Mutex<StoreInner>>,
}
31
/// Mutable state guarded by `EntryStore`'s lock.
#[derive(Debug, Default)]
struct StoreInner {
    /// Next ID to hand out; reset to 1 on each `store_entries` call.
    next_id: u32,
    /// ID -> entry data for the most recently stored listing.
    entries: HashMap<u32, StoredEntry>,
}
37
/// Minimal per-entry data retained for later ID -> URL lookups.
#[derive(Debug, Clone)]
struct StoredEntry {
    url: String,
    title: String,
}
43
44impl EntryStore {
45    pub fn new() -> Self {
46        Self::default()
47    }
48
49    /// Clear previous entries and store new ones.
50    /// Returns the list of entries with assigned IDs.
51    pub async fn store_entries(&self, entries: Vec<ParsedEntry>) -> anyhow::Result<Vec<FeedEntry>> {
52        let mut inner = self.inner.lock().await;
53        inner.entries.clear();
54        inner.next_id = 1;
55
56        let mut result = Vec::with_capacity(entries.len());
57        for e in entries {
58            let id = inner.next_id;
59            inner.next_id = inner
60                .next_id
61                .checked_add(1)
62                .ok_or_else(|| anyhow::anyhow!("entry ID overflow"))?;
63            inner.entries.insert(
64                id,
65                StoredEntry {
66                    url: e.url.clone(),
67                    title: e.title.clone(),
68                },
69            );
70            result.push(FeedEntry {
71                id,
72                title: e.title,
73                url: e.url,
74                published: e.published,
75                summary: e.summary,
76            });
77        }
78        Ok(result)
79    }
80
81    /// Look up the URL for a given numeric ID.
82    pub async fn get_url(&self, id: u32) -> Option<(String, String)> {
83        let inner = self.inner.lock().await;
84        inner
85            .entries
86            .get(&id)
87            .map(|e| (e.url.clone(), e.title.clone()))
88    }
89}
90
/// Intermediate parsed entry before ID assignment.
#[derive(Debug)]
pub struct ParsedEntry {
    /// Entry title; "(no title)" when the feed omits one.
    pub title: String,
    /// First link of the entry; empty string when absent.
    pub url: String,
    /// Published/updated date as YYYY-MM-DD, or "-" if absent.
    pub published: String,
    /// HTML-stripped summary truncated to 80 characters; may be empty.
    pub summary: String,
}
99
100/// Build a shared HTTP client with reasonable defaults.
101pub fn build_http_client() -> reqwest::Result<reqwest::Client> {
102    reqwest::Client::builder()
103        .timeout(std::time::Duration::from_secs(30))
104        .build()
105}
106
107/// Validate that a URL uses http or https scheme.
108fn validate_url(url: &str) -> anyhow::Result<reqwest::Url> {
109    let parsed = reqwest::Url::parse(url).map_err(|e| anyhow::anyhow!("invalid URL: {e}"))?;
110    match parsed.scheme() {
111        "http" | "https" => Ok(parsed),
112        scheme => anyhow::bail!("unsupported URL scheme: {scheme}"),
113    }
114}
115
116/// Check response size against limit via Content-Length header.
117fn check_content_length(resp: &reqwest::Response, limit: u64) -> anyhow::Result<()> {
118    if let Some(len) = resp.content_length() {
119        if len > limit {
120            anyhow::bail!("response too large: {len} bytes (limit: {limit} bytes)");
121        }
122    }
123    Ok(())
124}
125
126/// Fetch and parse an RSS/Atom feed from a URL.
127/// Returns a list of parsed entries (no IDs yet).
128pub async fn fetch_and_parse_feed(
129    client: &reqwest::Client,
130    url: &str,
131) -> anyhow::Result<(String, Vec<ParsedEntry>)> {
132    let validated = validate_url(url)?;
133
134    let resp = client.get(validated).send().await?;
135    check_content_length(&resp, MAX_FEED_SIZE)?;
136    let bytes = resp.bytes().await?;
137    let feed = feed_rs::parser::parse(&bytes[..])?;
138
139    let feed_title = feed
140        .title
141        .map(|t| t.content)
142        .unwrap_or_else(|| "(untitled feed)".to_string());
143
144    let entries = feed
145        .entries
146        .into_iter()
147        .map(|entry| {
148            let title = entry
149                .title
150                .map(|t| t.content)
151                .unwrap_or_else(|| "(no title)".to_string());
152
153            let url = entry
154                .links
155                .first()
156                .map(|l| l.href.clone())
157                .unwrap_or_default();
158
159            let published = entry
160                .published
161                .or(entry.updated)
162                .map(|d| d.format("%Y-%m-%d").to_string())
163                .unwrap_or_else(|| "-".to_string());
164
165            let summary = entry
166                .summary
167                .map(|s| truncate_text(&strip_html_simple(&s.content), 80))
168                .unwrap_or_default();
169
170            ParsedEntry {
171                title,
172                url,
173                published,
174                summary,
175            }
176        })
177        .collect();
178
179    Ok((feed_title, entries))
180}
181
182/// Format a list of feed entries as a Markdown table.
183///
184/// `total` is the total number of entries in the feed (before truncation).
185/// When `total > entries.len()`, a note about truncation is appended.
186pub fn format_entries_as_markdown(feed_title: &str, entries: &[FeedEntry], total: usize) -> String {
187    let mut out = String::with_capacity(entries.len() * 100);
188    out.push_str(&format!("## {feed_title}\n\n"));
189    out.push_str("| # | Title | Date |\n");
190    out.push_str("|---|-------|------|\n");
191
192    for e in entries {
193        let title_display = truncate_text(&e.title, 60);
194        out.push_str(&format!(
195            "| {} | [{}]({}) | {} |\n",
196            e.id, title_display, e.url, e.published
197        ));
198    }
199
200    if total > entries.len() {
201        out.push_str(&format!(
202            "\n*Showing {} of {} articles. Use `limit` to see more. Use `get` with # to read.*",
203            entries.len(),
204            total
205        ));
206    } else {
207        out.push_str(&format!(
208            "\n*{} articles. Use `get` with # to read.*",
209            entries.len()
210        ));
211    }
212    out
213}
214
215/// Fetch an article page and extract its text content.
216pub async fn fetch_article_text(client: &reqwest::Client, url: &str) -> anyhow::Result<String> {
217    let validated = validate_url(url)?;
218
219    let resp = client
220        .get(validated)
221        .header("User-Agent", "rss-fetch-mcp/0.1")
222        .send()
223        .await?;
224    check_content_length(&resp, MAX_ARTICLE_SIZE)?;
225
226    let html = resp.text().await?;
227    let text = html2text::from_read(html.as_bytes(), 80)?;
228
229    Ok(text)
230}
231
/// Strip HTML tags, skipping content inside `<script>` and `<style>` elements.
///
/// Fix: a literal '>' in text content (outside any tag) was previously
/// consumed — dropped from the output — and re-parsed the leftover tag
/// buffer, which could wrongly toggle the skip state. A text-position '>'
/// is now emitted verbatim.
///
/// Known limitation: a literal '<' in text still opens a pseudo-tag, since a
/// single forward pass cannot distinguish it from real markup.
fn strip_html_simple(html: &str) -> String {
    let mut result = String::with_capacity(html.len());
    let mut in_tag = false;
    let mut in_skip = false;
    let mut tag_buf = String::new();

    for ch in html.chars() {
        if ch == '<' {
            in_tag = true;
            tag_buf.clear();
            continue;
        }
        if ch == '>' {
            if in_tag {
                // Closing a tag: inspect its name to track script/style spans.
                in_tag = false;
                let tag_name = tag_buf
                    .split_whitespace()
                    .next()
                    .unwrap_or("")
                    .to_lowercase();
                if tag_name == "script" || tag_name == "style" {
                    in_skip = true;
                } else if tag_name == "/script" || tag_name == "/style" {
                    in_skip = false;
                }
            } else if !in_skip {
                // Literal '>' in text content: keep it.
                result.push(ch);
            }
            continue;
        }
        if in_tag {
            tag_buf.push(ch);
            continue;
        }
        if !in_skip {
            result.push(ch);
        }
    }
    result
}
269
/// Truncate text to at most `max` characters, appending "..." if cut.
///
/// Fix: for `max < 3` the old code returned "..." (3 characters), exceeding
/// the limit; there is no room for an ellipsis there, so the text is now
/// simply cut to `max` characters. The result never exceeds `max` chars.
fn truncate_text(s: &str, max: usize) -> String {
    let char_count = s.chars().count();
    if char_count <= max {
        return s.to_string();
    }
    if max < 3 {
        // No room for "...": hard-cut instead of overflowing the limit.
        return s.chars().take(max).collect();
    }
    let truncated: String = s.chars().take(max - 3).collect();
    format!("{truncated}...")
}
280
#[cfg(test)]
mod tests {
    use super::*;

    // --- strip_html_simple ---

    #[test]
    fn strip_html_basic() {
        assert_eq!(
            strip_html_simple("<p>Hello <b>world</b></p>"),
            "Hello world"
        );
    }

    #[test]
    fn strip_html_empty() {
        assert_eq!(strip_html_simple(""), "");
    }

    #[test]
    fn strip_html_skips_script() {
        // Script bodies must be dropped entirely, not just de-tagged.
        let html = "<p>before</p><script>alert('xss')</script><p>after</p>";
        assert_eq!(strip_html_simple(html), "beforeafter");
    }

    #[test]
    fn strip_html_skips_style() {
        let html = "<p>text</p><style>body{color:red}</style><p>more</p>";
        assert_eq!(strip_html_simple(html), "textmore");
    }

    // --- truncate_text ---

    #[test]
    fn truncate_short_text() {
        assert_eq!(truncate_text("hello", 10), "hello");
    }

    #[test]
    fn truncate_long_text() {
        let result = truncate_text("this is a long sentence", 10);
        assert!(result.ends_with("..."));
        assert!(result.chars().count() <= 10);
    }

    #[test]
    fn truncate_multibyte() {
        // Limit counts characters, not bytes, so multi-byte text must not
        // be split mid-codepoint.
        let result = truncate_text("あいうえおかきくけこ", 5);
        assert!(result.ends_with("..."));
        assert!(result.chars().count() <= 5);
    }

    // --- validate_url ---

    #[test]
    fn validate_url_accepts_https() {
        assert!(validate_url("https://example.com/feed.xml").is_ok());
    }

    #[test]
    fn validate_url_accepts_http() {
        assert!(validate_url("http://example.com/feed.xml").is_ok());
    }

    #[test]
    fn validate_url_rejects_file() {
        // file:// must be rejected to block local-file exfiltration.
        let err = validate_url("file:///etc/passwd").unwrap_err();
        assert!(err.to_string().contains("unsupported URL scheme"));
    }

    #[test]
    fn validate_url_rejects_ftp() {
        let err = validate_url("ftp://example.com/data").unwrap_err();
        assert!(err.to_string().contains("unsupported URL scheme"));
    }

    #[test]
    fn validate_url_rejects_invalid() {
        assert!(validate_url("not a url").is_err());
    }

    // --- format_entries_as_markdown ---

    #[test]
    fn format_markdown_table() {
        let entries = vec![FeedEntry {
            id: 1,
            title: "Test Article".to_string(),
            url: "https://example.com/1".to_string(),
            published: "2026-02-14".to_string(),
            summary: "A test".to_string(),
        }];
        let md = format_entries_as_markdown("Test Feed", &entries, 1);
        assert!(md.contains("| 1 |"));
        assert!(md.contains("Test Article"));
        assert!(md.contains("## Test Feed"));
        assert!(md.contains("1 articles."));
    }

    #[test]
    fn format_markdown_table_truncated() {
        // When total exceeds the shown entries, the truncation note appears.
        let entries = vec![FeedEntry {
            id: 1,
            title: "Article".to_string(),
            url: "https://example.com/1".to_string(),
            published: "2026-02-14".to_string(),
            summary: String::new(),
        }];
        let md = format_entries_as_markdown("Feed", &entries, 50);
        assert!(md.contains("Showing 1 of 50 articles"));
        assert!(md.contains("Use `limit` to see more"));
    }

    // --- EntryStore ---

    #[tokio::test]
    async fn entry_store_roundtrip() {
        let store = EntryStore::new();
        let parsed = vec![
            ParsedEntry {
                title: "Article A".to_string(),
                url: "https://a.com".to_string(),
                published: "2026-01-01".to_string(),
                summary: "sumA".to_string(),
            },
            ParsedEntry {
                title: "Article B".to_string(),
                url: "https://b.com".to_string(),
                published: "2026-01-02".to_string(),
                summary: "sumB".to_string(),
            },
        ];

        // IDs are assigned sequentially from 1.
        let entries = store.store_entries(parsed).await.unwrap();
        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].id, 1);
        assert_eq!(entries[1].id, 2);

        let (url, title) = store.get_url(1).await.unwrap();
        assert_eq!(url, "https://a.com");
        assert_eq!(title, "Article A");

        // Unknown IDs yield None rather than an error.
        assert!(store.get_url(99).await.is_none());
    }

    #[tokio::test]
    async fn store_clears_on_new_list() {
        // Storing a new listing replaces the old one; ID 1 must now map to
        // the new entry, not the stale one.
        let store = EntryStore::new();
        let first = vec![ParsedEntry {
            title: "Old".to_string(),
            url: "https://old.com".to_string(),
            published: "-".to_string(),
            summary: String::new(),
        }];
        store.store_entries(first).await.unwrap();

        let second = vec![ParsedEntry {
            title: "New".to_string(),
            url: "https://new.com".to_string(),
            published: "-".to_string(),
            summary: String::new(),
        }];
        store.store_entries(second).await.unwrap();

        let (url, _) = store.get_url(1).await.unwrap();
        assert_eq!(url, "https://new.com");
    }
}