1use anyhow::{Context, Result};
2use chrono::{DateTime, Utc};
3use encoding_rs::Encoding;
4use reqwest::Client;
5
6use crate::config::ExtractorMethod;
7
8#[derive(Debug, Clone)]
9pub struct Article {
10 pub title: String,
11 pub url: String,
12 pub published: Option<DateTime<Utc>>,
13 pub feed_url: String,
14 pub feed_name: String,
15 pub extractor: ExtractorMethod,
16 pub read: bool,
17 pub rss_content: Option<String>,
18}
19
20pub async fn extract_content(
24 client: &Client,
25 article_url: &str,
26 method: &ExtractorMethod,
27 width: usize,
28 rss_content: Option<&str>,
29) -> Result<String> {
30 match method {
31 ExtractorMethod::Readability => match fetch_and_extract(client, article_url, width).await {
32 Ok((_title, text)) => Ok(text),
33 Err(e) => match rss_content.filter(|c| !c.is_empty()) {
34 Some(content) => Ok(html_to_text(content, width)),
35 None => Err(e),
36 },
37 },
38 ExtractorMethod::RssContent => {
39 if let Some(html) = rss_content {
40 if !html.is_empty() {
41 return Ok(html_to_text(html, width));
42 }
43 }
44 let (_title, text) = fetch_and_extract(client, article_url, width).await?;
45 Ok(text)
46 }
47 }
48}
49
50pub async fn fetch_and_extract(
52 client: &Client,
53 url: &str,
54 width: usize,
55) -> Result<(String, String)> {
56 let response = client
57 .get(url)
58 .header("User-Agent", "feed-cli/0.1")
59 .send()
60 .await
61 .with_context(|| format!("Failed to fetch: {}", url))?;
62
63 let content_type = response
64 .headers()
65 .get("content-type")
66 .and_then(|v| v.to_str().ok())
67 .map(String::from);
68
69 let bytes = response
70 .bytes()
71 .await
72 .with_context(|| format!("Failed to read response from: {}", url))?;
73
74 let html = decode_html_bytes(&bytes, content_type.as_deref());
75
76 Ok(extract_from_html(&html, width))
77}
78
79pub fn extract_from_html(html: &str, width: usize) -> (String, String) {
81 match readability::Readability::new(html, None) {
82 Ok(mut r) => match r.parse() {
83 Some(article) => {
84 let content = article.content.unwrap_or_default();
85 let text = html2text::from_read(content.as_bytes(), width).unwrap_or_default();
86 (article.title.unwrap_or_default(), text)
87 }
88 None => (String::new(), html_to_text(html, width)),
89 },
90 Err(_) => (String::new(), html_to_text(html, width)),
91 }
92}
93
94pub fn html_to_text(html: &str, width: usize) -> String {
96 html2text::from_read(html.as_bytes(), width).unwrap_or_default()
97}
98
99pub(crate) fn decode_html_bytes(bytes: &[u8], content_type: Option<&str>) -> String {
102 let charset = detect_charset(content_type, bytes);
103
104 if let Some(charset) = charset {
105 if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
106 if encoding != encoding_rs::UTF_8 {
107 let (decoded, _, _) = encoding.decode(bytes);
108 return decoded.into_owned();
109 }
110 }
111 }
112
113 String::from_utf8_lossy(bytes).into_owned()
114}
115
/// Finds the charset label declared for an HTML document.
///
/// Lookup order:
/// 1. The `charset` parameter of the `Content-Type` header value, if `content_type` is
///    given. The parameter name is matched case-insensitively and surrounding double
///    quotes are removed. An empty value is ignored so that the document itself can
///    still be consulted.
/// 2. The first `charset = value` occurrence within the first 4096 bytes of `bytes`
///    (covers both `<meta charset="...">` and `<meta http-equiv ... content="...;
///    charset=...">`). Occurrences of the word that are not followed by `=` are
///    skipped.
///
/// Returns the label exactly as written (case preserved), or `None` when no non-empty
/// label is found. The label is not validated against any list of encodings.
fn detect_charset(content_type: Option<&str>, bytes: &[u8]) -> Option<String> {
    const KEY: &str = "charset";
    const SNIFF_LIMIT: usize = 4096;

    fn is_ascii_ws(c: char) -> bool {
        c.is_ascii_whitespace()
    }

    // 1. HTTP header: `type/subtype; key=value; key=value`.
    let from_header = content_type.and_then(|header| {
        header.split(';').skip(1).find_map(|param| {
            let (key, value) = param.split_once('=')?;
            let value = value.trim().trim_matches('"');
            (key.trim().eq_ignore_ascii_case(KEY) && !value.is_empty()).then_some(value)
        })
    });
    if let Some(label) = from_header {
        return Some(label.to_string());
    }

    // 2. Sniff the start of the document. ASCII lowercasing keeps byte offsets
    //    identical, so positions found in `lower` are valid indices into `lossy`.
    let head = &bytes[..bytes.len().min(SNIFF_LIMIT)];
    let lossy = String::from_utf8_lossy(head);
    let lower = lossy.to_ascii_lowercase();

    lower.match_indices(KEY).find_map(|(pos, _)| {
        let rest = lossy[pos + KEY.len()..].trim_start_matches(is_ascii_ws);
        // Require an actual assignment; a bare word "charset" in text is not a declaration.
        let rest = rest.strip_prefix('=')?.trim_start_matches(is_ascii_ws);
        let label: String = rest
            .trim_start_matches(['"', '\''])
            .chars()
            .take_while(|&c| !matches!(c, '"' | '\'' | ';' | '>') && !c.is_ascii_whitespace())
            .collect();
        (!label.is_empty()).then_some(label)
    })
}
146
/// Unit tests for charset detection and byte decoding.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- detect_charset -------------------------------------------------------------

    #[test]
    fn test_detect_charset_from_content_type() {
        let label = detect_charset(Some("text/html; charset=utf-8"), b"");
        assert_eq!(label.as_deref(), Some("utf-8"));
    }

    #[test]
    fn test_detect_charset_case_insensitive_key() {
        // The parameter name is matched case-insensitively; the value keeps its case.
        let label = detect_charset(Some("text/html; Charset=UTF-8"), b"");
        assert_eq!(label.as_deref(), Some("UTF-8"));
    }

    #[test]
    fn test_detect_charset_quoted_value() {
        let label = detect_charset(Some("text/html; charset=\"shift_jis\""), b"");
        assert_eq!(label.as_deref(), Some("shift_jis"));
    }

    #[test]
    fn test_detect_charset_from_meta_tag() {
        let document = br#"<html><head><meta charset="euc-jp"></head></html>"#;
        assert_eq!(detect_charset(None, document).as_deref(), Some("euc-jp"));
    }

    #[test]
    fn test_detect_charset_from_meta_http_equiv() {
        let document = br#"<html><head><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"></head></html>"#;
        assert_eq!(detect_charset(None, document).as_deref(), Some("iso-8859-1"));
    }

    #[test]
    fn test_detect_charset_none_when_absent() {
        assert!(detect_charset(None, b"<html><body>hello world</body></html>").is_none());
    }

    #[test]
    fn test_detect_charset_content_type_takes_priority() {
        // Header and document disagree: the header wins.
        let document = br#"<html><head><meta charset="euc-jp"></head></html>"#;
        let label = detect_charset(Some("text/html; charset=shift_jis"), document);
        assert_eq!(label.as_deref(), Some("shift_jis"));
    }

    #[test]
    fn test_detect_charset_empty_content_type() {
        assert!(detect_charset(Some(""), b"<html></html>").is_none());
    }

    #[test]
    fn test_detect_charset_no_equals() {
        assert!(detect_charset(Some("text/html; charset"), b"").is_none());
    }

    // ---- decode_html_bytes ----------------------------------------------------------

    #[test]
    fn test_decode_utf8_default() {
        assert_eq!(
            decode_html_bytes("Hello, world!".as_bytes(), None),
            "Hello, world!"
        );
    }

    #[test]
    fn test_decode_utf8_japanese() {
        let decoded = decode_html_bytes("こんにちは".as_bytes(), Some("text/html; charset=utf-8"));
        assert_eq!(decoded, "こんにちは");
    }

    #[test]
    fn test_decode_shift_jis() {
        let encoded = encoding_rs::SHIFT_JIS.encode("テスト").0;
        let decoded = decode_html_bytes(&encoded, Some("text/html; charset=shift_jis"));
        assert_eq!(decoded, "テスト");
    }

    #[test]
    fn test_decode_iso_8859_1() {
        // The input is produced with the windows-1252 encoder but declared as
        // "iso-8859-1"; the Encoding Standard treats that label as windows-1252.
        let encoded = encoding_rs::WINDOWS_1252.encode("café").0;
        let decoded = decode_html_bytes(&encoded, Some("text/html; charset=iso-8859-1"));
        assert_eq!(decoded, "café");
    }

    #[test]
    fn test_decode_euc_jp() {
        let encoded = encoding_rs::EUC_JP.encode("日本語").0;
        let decoded = decode_html_bytes(&encoded, Some("text/html; charset=euc-jp"));
        assert_eq!(decoded, "日本語");
    }

    #[test]
    fn test_decode_charset_from_meta_tag() {
        // No header at all: the charset has to be sniffed from the document itself.
        let document = "<html><head><meta charset=\"shift_jis\"></head><body>テスト</body></html>";
        let encoded = encoding_rs::SHIFT_JIS.encode(document).0;
        assert!(decode_html_bytes(&encoded, None).contains("テスト"));
    }

    #[test]
    fn test_decode_no_charset_lossy_utf8() {
        assert_eq!(decode_html_bytes(b"plain ascii", None), "plain ascii");
    }

    #[test]
    fn test_decode_unknown_encoding_label() {
        // An unrecognised label falls back to UTF-8 decoding.
        let decoded = decode_html_bytes(b"hello", Some("text/html; charset=bogus-encoding"));
        assert_eq!(decoded, "hello");
    }
}