1use anyhow::{Context, Result};
2use chrono::{DateTime, Utc};
3use encoding_rs::Encoding;
4use reqwest::Client;
5
6use crate::config::ExtractorMethod;
7
8#[derive(Debug, Clone)]
9pub struct Article {
10 pub title: String,
11 pub url: String,
12 pub published: Option<DateTime<Utc>>,
13 pub feed_url: String,
14 pub feed_name: String,
15 pub extractor: ExtractorMethod,
16 pub read: bool,
17 pub rss_content: Option<String>,
18}
19
20pub async fn extract_content(
24 client: &Client,
25 article_url: &str,
26 method: &ExtractorMethod,
27 width: usize,
28 rss_content: Option<&str>,
29) -> Result<String> {
30 match method {
31 ExtractorMethod::Readability => match fetch_and_extract(client, article_url, width).await {
32 Ok((_title, text)) => Ok(text),
33 Err(e) => match rss_content.filter(|c| !c.is_empty()) {
34 Some(content) => Ok(html_to_text(content, width)),
35 None => Err(e),
36 },
37 },
38 ExtractorMethod::RssContent => {
39 if let Some(html) = rss_content {
40 if !html.is_empty() {
41 return Ok(html_to_text(html, width));
42 }
43 }
44 let (_title, text) = fetch_and_extract(client, article_url, width).await?;
45 Ok(text)
46 }
47 }
48}
49
50pub async fn fetch_and_extract(
52 client: &Client,
53 url: &str,
54 width: usize,
55) -> Result<(String, String)> {
56 let response = client
57 .get(url)
58 .header("User-Agent", "feed-cli/0.1")
59 .send()
60 .await
61 .with_context(|| format!("Failed to fetch: {}", url))?;
62
63 let content_type = response
64 .headers()
65 .get("content-type")
66 .and_then(|v| v.to_str().ok())
67 .map(String::from);
68
69 let bytes = response
70 .bytes()
71 .await
72 .with_context(|| format!("Failed to read response from: {}", url))?;
73
74 let html = decode_html_bytes(&bytes, content_type.as_deref());
75
76 Ok(extract_from_html(&html, width))
77}
78
79pub fn extract_from_html(html: &str, width: usize) -> (String, String) {
81 match readability::Readability::new(html, None) {
82 Ok(mut r) => match r.parse() {
83 Some(article) => {
84 let content = article.content.unwrap_or_default();
85 let text = html2text::from_read(content.as_bytes(), width).unwrap_or_default();
86 (article.title.unwrap_or_default(), text)
87 }
88 None => (String::new(), html_to_text(html, width)),
89 },
90 Err(_) => (String::new(), html_to_text(html, width)),
91 }
92}
93
94pub fn html_to_text(html: &str, width: usize) -> String {
96 html2text::from_read(html.as_bytes(), width).unwrap_or_default()
97}
98
99pub(crate) fn decode_html_bytes(bytes: &[u8], content_type: Option<&str>) -> String {
102 let charset = detect_charset(content_type, bytes);
103
104 if let Some(charset) = charset {
105 if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
106 if encoding != encoding_rs::UTF_8 {
107 let (decoded, _, _) = encoding.decode(bytes);
108 return decoded.into_owned();
109 }
110 }
111 }
112
113 String::from_utf8_lossy(bytes).into_owned()
114}
115
/// Finds the charset label declared for an HTML document.
///
/// Lookup order:
/// 1. the `charset` parameter of the `Content-Type` header value, if it is
///    present and non-empty;
/// 2. the first `charset = value` assignment found in the first 4096 bytes of
///    the body (covers both `<meta charset="...">` and
///    `<meta http-equiv="Content-Type" content="...; charset=...">`).
///
/// The label is returned as written (case preserved, quotes removed) and is
/// not validated here. Returns `None` when no non-empty label is found.
fn detect_charset(content_type: Option<&str>, bytes: &[u8]) -> Option<String> {
    let from_header = content_type.and_then(|ct| {
        // Skip the media type itself; only the parameters are of interest.
        ct.split(';').skip(1).find_map(|param| {
            let (key, value) = param.split_once('=')?;
            let value = value.trim().trim_matches('"');
            // An empty `charset=` is useless: keep looking, then sniff the body.
            (key.trim().eq_ignore_ascii_case("charset") && !value.is_empty()).then_some(value)
        })
    });
    if let Some(charset) = from_header {
        return Some(charset.to_string());
    }

    let head = &bytes[..bytes.len().min(4096)];
    let lossy = String::from_utf8_lossy(head);
    // ASCII lowercasing keeps byte offsets identical, so positions found in
    // `lower` can be used to slice `lossy`.
    let lower = lossy.to_ascii_lowercase();

    // Check every occurrence: the word "charset" may appear in ordinary text
    // (a title, a script) before the real declaration.
    lower.match_indices("charset").find_map(|(pos, needle)| {
        let after = lossy[pos + needle.len()..]
            .trim_start_matches(|c: char| c.is_ascii_whitespace());
        // Only an actual assignment counts as a declaration.
        let value = after
            .strip_prefix('=')?
            .trim_start_matches(|c: char| c.is_ascii_whitespace())
            .trim_start_matches(['"', '\'']);
        // Stop at anything that can end an attribute value, including the `/`
        // of a self-closing tag and any whitespace.
        let charset: String = value
            .chars()
            .take_while(|c| !matches!(c, '"' | '\'' | ';' | '>' | '/') && !c.is_ascii_whitespace())
            .collect();
        (!charset.is_empty()).then_some(charset)
    })
}
146
#[cfg(test)]
mod tests {
    use super::*;

    // ---- detect_charset: Content-Type header ----

    /// A plain `charset` parameter in the header is returned verbatim.
    #[test]
    fn test_detect_charset_from_content_type() {
        let header = Some("text/html; charset=utf-8");
        assert_eq!(detect_charset(header, b"").as_deref(), Some("utf-8"));
    }

    /// The parameter name is matched case-insensitively; the value keeps its case.
    #[test]
    fn test_detect_charset_case_insensitive_key() {
        let header = Some("text/html; Charset=UTF-8");
        assert_eq!(detect_charset(header, b"").as_deref(), Some("UTF-8"));
    }

    /// Double quotes around the header value are removed.
    #[test]
    fn test_detect_charset_quoted_value() {
        let header = Some("text/html; charset=\"shift_jis\"");
        assert_eq!(detect_charset(header, b"").as_deref(), Some("shift_jis"));
    }

    // ---- detect_charset: sniffing the document ----

    /// `<meta charset="...">` is picked up when there is no header.
    #[test]
    fn test_detect_charset_from_meta_tag() {
        let html = br#"<html><head><meta charset="euc-jp"></head></html>"#;
        assert_eq!(detect_charset(None, html).as_deref(), Some("euc-jp"));
    }

    /// The `http-equiv` form of the declaration is picked up as well.
    #[test]
    fn test_detect_charset_from_meta_http_equiv() {
        let html = br#"<html><head><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"></head></html>"#;
        assert_eq!(detect_charset(None, html).as_deref(), Some("iso-8859-1"));
    }

    /// No header and no declaration in the body means no charset.
    #[test]
    fn test_detect_charset_none_when_absent() {
        let html = b"<html><body>hello world</body></html>";
        assert_eq!(detect_charset(None, html), None);
    }

    /// When header and document disagree, the header wins.
    #[test]
    fn test_detect_charset_content_type_takes_priority() {
        let html = br#"<html><head><meta charset="euc-jp"></head></html>"#;
        let header = Some("text/html; charset=shift_jis");
        assert_eq!(detect_charset(header, html).as_deref(), Some("shift_jis"));
    }

    // ---- decode_html_bytes ----

    /// UTF-8 input declared as UTF-8 round-trips unchanged.
    #[test]
    fn test_decode_utf8_japanese() {
        let input = "こんにちは";
        let decoded = decode_html_bytes(input.as_bytes(), Some("text/html; charset=utf-8"));
        assert_eq!(decoded, input);
    }

    /// Shift_JIS bytes are decoded according to the header.
    #[test]
    fn test_decode_shift_jis() {
        let encoded = encoding_rs::SHIFT_JIS.encode("テスト").0;
        let decoded = decode_html_bytes(&encoded, Some("text/html; charset=shift_jis"));
        assert_eq!(decoded, "テスト");
    }

    /// Bytes produced by the windows-1252 encoder decode under an
    /// `iso-8859-1` header label.
    #[test]
    fn test_decode_iso_8859_1() {
        let encoded = encoding_rs::WINDOWS_1252.encode("café").0;
        let decoded = decode_html_bytes(&encoded, Some("text/html; charset=iso-8859-1"));
        assert_eq!(decoded, "café");
    }

    /// EUC-JP bytes are decoded according to the header.
    #[test]
    fn test_decode_euc_jp() {
        let encoded = encoding_rs::EUC_JP.encode("日本語").0;
        let decoded = decode_html_bytes(&encoded, Some("text/html; charset=euc-jp"));
        assert_eq!(decoded, "日本語");
    }

    /// Without a header, the charset declared inside the document is used.
    #[test]
    fn test_decode_charset_from_meta_tag() {
        let page = "<html><head><meta charset=\"shift_jis\"></head><body>テスト</body></html>";
        let encoded = encoding_rs::SHIFT_JIS.encode(page).0;
        let decoded = decode_html_bytes(&encoded, None);
        assert!(decoded.contains("テスト"));
    }

    /// With no charset information at all, ASCII input passes through.
    #[test]
    fn test_decode_no_charset_lossy_utf8() {
        let decoded = decode_html_bytes(b"plain ascii", None);
        assert_eq!(decoded, "plain ascii");
    }

    /// An unrecognised label does not fail; ASCII input passes through.
    #[test]
    fn test_decode_unknown_encoding_label() {
        let decoded = decode_html_bytes(b"hello", Some("text/html; charset=bogus-encoding"));
        assert_eq!(decoded, "hello");
    }
}