1use anyhow::{bail, Context, Result};
2use chrono::{DateTime, Utc};
3use feed_rs::parser;
4use reqwest::Client;
5use scraper::{Html, Selector};
6use url::Url;
7
8use crate::cache::HttpMetadata;
9
10#[derive(Clone)]
11pub struct RawFeed {
12 pub title: String,
13 pub entries: Vec<RawEntry>,
14 pub etag: Option<String>,
15 pub last_modified: Option<String>,
16}
17
18#[derive(Clone)]
19pub struct RawEntry {
20 pub title: String,
21 pub url: String,
22 pub published: Option<DateTime<Utc>>,
23 pub rss_content: Option<String>,
24}
25
26pub enum FetchResult {
27 Fetched(RawFeed),
28 NotModified,
29}
30
31impl RawFeed {
32 pub fn parse(data: &[u8], etag: Option<String>, last_modified: Option<String>) -> Result<Self> {
34 let feed = parser::parse(data).context("Failed to parse feed")?;
35
36 let title = feed
37 .title
38 .map(|t| t.content)
39 .unwrap_or_else(|| "(untitled)".to_string());
40
41 let entries = feed
42 .entries
43 .into_iter()
44 .map(|entry| {
45 let entry_title = entry
46 .title
47 .map(|t| t.content)
48 .unwrap_or_else(|| "(untitled)".to_string());
49
50 let url = entry
51 .links
52 .first()
53 .map(|l| l.href.clone())
54 .unwrap_or_default();
55
56 let published = entry
57 .published
58 .or(entry.updated)
59 .map(|dt| dt.with_timezone(&Utc));
60
61 let rss_content = entry
62 .content
63 .as_ref()
64 .and_then(|c| c.body.clone())
65 .or_else(|| entry.summary.as_ref().map(|s| s.content.clone()));
66
67 RawEntry {
68 title: entry_title,
69 url,
70 published,
71 rss_content,
72 }
73 })
74 .collect();
75
76 Ok(RawFeed {
77 title,
78 entries,
79 etag,
80 last_modified,
81 })
82 }
83}
84
85pub async fn fetch(client: &Client, url: &str, metadata: &HttpMetadata) -> Result<FetchResult> {
87 let mut request = client.get(url).header("User-Agent", "feed-cli/0.1");
88
89 if let Some(etag) = &metadata.etag {
90 request = request.header("If-None-Match", etag.as_str());
91 }
92 if let Some(lm) = &metadata.last_modified {
93 request = request.header("If-Modified-Since", lm.as_str());
94 }
95
96 let response = request
97 .send()
98 .await
99 .with_context(|| format!("Failed to fetch: {}", url))?;
100
101 if response.status() == reqwest::StatusCode::NOT_MODIFIED {
102 return Ok(FetchResult::NotModified);
103 }
104
105 let etag = response
106 .headers()
107 .get("etag")
108 .and_then(|v| v.to_str().ok())
109 .map(String::from);
110 let last_modified = response
111 .headers()
112 .get("last-modified")
113 .and_then(|v| v.to_str().ok())
114 .map(String::from);
115
116 let bytes = response
117 .bytes()
118 .await
119 .with_context(|| format!("Failed to read response from: {}", url))?;
120
121 let feed = RawFeed::parse(&bytes, etag, last_modified)?;
122
123 Ok(FetchResult::Fetched(feed))
124}
125
126pub fn discover_feed_urls(html: &str, base_url: &str) -> Result<Vec<String>> {
128 let document = Html::parse_document(html);
129 let selector = Selector::parse(
130 r#"link[rel="alternate"][type="application/rss+xml"], link[rel="alternate"][type="application/atom+xml"]"#,
131 )
132 .expect("valid CSS selector");
133
134 let base = Url::parse(base_url).context("Invalid base URL")?;
135
136 let urls: Vec<String> = document
137 .select(&selector)
138 .filter_map(|el| el.value().attr("href"))
139 .filter_map(|href| base.join(href).ok())
140 .map(|u| u.to_string())
141 .collect();
142
143 if urls.is_empty() {
144 bail!("No RSS/Atom feed found at {}", base_url);
145 }
146
147 Ok(urls)
148}
149
150pub async fn resolve_feed_url(client: &Client, url: &str) -> Result<String> {
152 let response = client
153 .get(url)
154 .header("User-Agent", "feed-cli/0.1")
155 .send()
156 .await
157 .with_context(|| format!("Failed to fetch: {}", url))?;
158
159 let content_type = response
160 .headers()
161 .get("content-type")
162 .and_then(|v| v.to_str().ok())
163 .unwrap_or("")
164 .to_string();
165
166 if !is_html_content_type(&content_type) {
167 return Ok(url.to_string());
168 }
169
170 let body = response
171 .text()
172 .await
173 .with_context(|| format!("Failed to read response from: {}", url))?;
174
175 let feed_urls = discover_feed_urls(&body, url)?;
176 feed_urls.into_iter().next().context("No feed URL found")
177}
178
/// Returns true when the media type — ignoring any `;`-separated parameters
/// such as `charset` — is `text/html` (case-insensitive, whitespace-tolerant).
fn is_html_content_type(content_type: &str) -> bool {
    let mime = match content_type.split_once(';') {
        Some((mime, _params)) => mime,
        None => content_type,
    };
    mime.trim().eq_ignore_ascii_case("text/html")
}
187
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of the Content-Type classifier: every HTML-ish
    /// spelling must match, every feed/XML/empty spelling must not.
    #[test]
    fn test_is_html_content_type() {
        let html_cases = [
            "text/html",
            "text/html; charset=utf-8",
            "TEXT/HTML",
            "text/html ; charset=utf-8",
        ];
        for ct in html_cases {
            assert!(is_html_content_type(ct), "expected HTML: {ct:?}");
        }

        let non_html_cases = [
            "application/rss+xml",
            "application/atom+xml",
            "application/xml",
            "text/xml",
            "",
            " ",
        ];
        for ct in non_html_cases {
            assert!(!is_html_content_type(ct), "expected non-HTML: {ct:?}");
        }
    }
}