Skip to main content

feed/
feed_source.rs

1use anyhow::{bail, Context, Result};
2use chrono::{DateTime, Utc};
3use feed_rs::parser;
4use reqwest::Client;
5use scraper::{Html, Selector};
6use url::Url;
7
8use crate::cache::HttpMetadata;
9
10#[derive(Clone)]
11pub struct RawFeed {
12    pub title: String,
13    pub entries: Vec<RawEntry>,
14    pub etag: Option<String>,
15    pub last_modified: Option<String>,
16}
17
18#[derive(Clone)]
19pub struct RawEntry {
20    pub title: String,
21    pub url: String,
22    pub published: Option<DateTime<Utc>>,
23    pub rss_content: Option<String>,
24}
25
26pub enum FetchResult {
27    Fetched(RawFeed),
28    NotModified,
29}
30
31impl RawFeed {
32    /// Parse RSS/Atom feed from bytes.
33    pub fn parse(data: &[u8], etag: Option<String>, last_modified: Option<String>) -> Result<Self> {
34        let feed = parser::parse(data).context("Failed to parse feed")?;
35
36        let title = feed
37            .title
38            .map(|t| t.content)
39            .unwrap_or_else(|| "(untitled)".to_string());
40
41        let entries = feed
42            .entries
43            .into_iter()
44            .map(|entry| {
45                let entry_title = entry
46                    .title
47                    .map(|t| t.content)
48                    .unwrap_or_else(|| "(untitled)".to_string());
49
50                let url = entry
51                    .links
52                    .first()
53                    .map(|l| l.href.clone())
54                    .unwrap_or_default();
55
56                let published = entry
57                    .published
58                    .or(entry.updated)
59                    .map(|dt| dt.with_timezone(&Utc));
60
61                let rss_content = entry
62                    .content
63                    .as_ref()
64                    .and_then(|c| c.body.clone())
65                    .or_else(|| entry.summary.as_ref().map(|s| s.content.clone()));
66
67                RawEntry {
68                    title: entry_title,
69                    url,
70                    published,
71                    rss_content,
72                }
73            })
74            .collect();
75
76        Ok(RawFeed {
77            title,
78            entries,
79            etag,
80            last_modified,
81        })
82    }
83}
84
85/// Fetch a feed with conditional GET support.
86pub async fn fetch(client: &Client, url: &str, metadata: &HttpMetadata) -> Result<FetchResult> {
87    let mut request = client.get(url).header("User-Agent", "feed-cli/0.1");
88
89    if let Some(etag) = &metadata.etag {
90        request = request.header("If-None-Match", etag.as_str());
91    }
92    if let Some(lm) = &metadata.last_modified {
93        request = request.header("If-Modified-Since", lm.as_str());
94    }
95
96    let response = request
97        .send()
98        .await
99        .with_context(|| format!("Failed to fetch: {}", url))?;
100
101    if response.status() == reqwest::StatusCode::NOT_MODIFIED {
102        return Ok(FetchResult::NotModified);
103    }
104
105    let etag = response
106        .headers()
107        .get("etag")
108        .and_then(|v| v.to_str().ok())
109        .map(String::from);
110    let last_modified = response
111        .headers()
112        .get("last-modified")
113        .and_then(|v| v.to_str().ok())
114        .map(String::from);
115
116    let bytes = response
117        .bytes()
118        .await
119        .with_context(|| format!("Failed to read response from: {}", url))?;
120
121    let feed = RawFeed::parse(&bytes, etag, last_modified)?;
122
123    Ok(FetchResult::Fetched(feed))
124}
125
126/// Discover RSS/Atom feed URLs from HTML via autodiscovery.
127pub fn discover_feed_urls(html: &str, base_url: &str) -> Result<Vec<String>> {
128    let document = Html::parse_document(html);
129    let selector = Selector::parse(
130        r#"link[rel="alternate"][type="application/rss+xml"], link[rel="alternate"][type="application/atom+xml"]"#,
131    )
132    .expect("valid CSS selector");
133
134    let base = Url::parse(base_url).context("Invalid base URL")?;
135
136    let urls: Vec<String> = document
137        .select(&selector)
138        .filter_map(|el| el.value().attr("href"))
139        .filter_map(|href| base.join(href).ok())
140        .map(|u| u.to_string())
141        .collect();
142
143    if urls.is_empty() {
144        bail!("No RSS/Atom feed found at {}", base_url);
145    }
146
147    Ok(urls)
148}
149
150/// Resolve a URL to a feed URL (follows HTML autodiscovery if needed).
151pub async fn resolve_feed_url(client: &Client, url: &str) -> Result<String> {
152    let response = client
153        .get(url)
154        .header("User-Agent", "feed-cli/0.1")
155        .send()
156        .await
157        .with_context(|| format!("Failed to fetch: {}", url))?;
158
159    let content_type = response
160        .headers()
161        .get("content-type")
162        .and_then(|v| v.to_str().ok())
163        .unwrap_or("")
164        .to_string();
165
166    if !is_html_content_type(&content_type) {
167        return Ok(url.to_string());
168    }
169
170    let body = response
171        .text()
172        .await
173        .with_context(|| format!("Failed to read response from: {}", url))?;
174
175    let feed_urls = discover_feed_urls(&body, url)?;
176    feed_urls.into_iter().next().context("No feed URL found")
177}
178
/// Reports whether a `Content-Type` header value denotes HTML.
///
/// Only the media type in front of the first `;` is examined; surrounding
/// whitespace is ignored and the comparison with `text/html` is ASCII
/// case-insensitive. An empty or blank value is not HTML.
fn is_html_content_type(content_type: &str) -> bool {
    // Everything from the first `;` onwards is parameters such as `charset=utf-8`.
    let media_type = match content_type.find(';') {
        Some(semicolon) => &content_type[..semicolon],
        None => content_type,
    };

    media_type.trim().eq_ignore_ascii_case("text/html")
}
187
#[cfg(test)]
mod tests {
    use super::*;

    // is_html_content_type returns true only for "text/html" (case-insensitive,
    // ignoring parameters and surrounding whitespace).
    #[test]
    fn test_is_html_content_type_accepts_html() {
        let accepted = [
            "text/html",
            "text/html; charset=utf-8",
            "TEXT/HTML",
            "text/html ; charset=utf-8",
            "Text/Html;charset=UTF-8",
            "\ttext/html\t",
        ];

        for value in accepted.iter() {
            assert!(
                is_html_content_type(value),
                "expected {:?} to be treated as HTML",
                value
            );
        }
    }

    // Feed media types, blank values and look-alikes must not be treated as HTML,
    // otherwise a real feed URL would be sent through autodiscovery.
    #[test]
    fn test_is_html_content_type_rejects_non_html() {
        let rejected = [
            "application/rss+xml",
            "application/atom+xml",
            "application/xml",
            "text/xml",
            "",
            "  ",
            // Only the exact media type counts, not prefixes or relatives.
            "text/htmlx",
            "application/xhtml+xml",
            // `text/html` appearing in the parameters must not match.
            "text/plain; note=text/html",
            ";text/html",
        ];

        for value in rejected.iter() {
            assert!(
                !is_html_content_type(value),
                "expected {:?} not to be treated as HTML",
                value
            );
        }
    }
}