feed/
article.rs

1use anyhow::{Context, Result};
2use chrono::{DateTime, Utc};
3use encoding_rs::Encoding;
4use reqwest::Client;
5
6use crate::config::ExtractorMethod;
7
/// A single feed entry plus the metadata needed to fetch and display it.
#[derive(Debug, Clone)]
pub struct Article {
    /// Article title.
    pub title: String,
    /// Article URL.
    /// NOTE(review): presumably the link handed to `extract_content` as `article_url` — confirm against callers.
    pub url: String,
    /// Publication timestamp in UTC; `None` when no timestamp is available.
    pub published: Option<DateTime<Utc>>,
    /// URL of the feed this article belongs to (presumably the feed it was parsed from — confirm).
    pub feed_url: String,
    /// Name of the feed this article belongs to (presumably a display name — confirm).
    pub feed_name: String,
    /// Extraction method associated with this article (see `ExtractorMethod`).
    pub extractor: ExtractorMethod,
    /// Read flag. NOTE(review): presumably `true` once the user has opened the article — confirm.
    pub read: bool,
    /// Optional content string.
    /// NOTE(review): presumably the HTML/text body embedded in the feed item, matching the
    /// `rss_content` argument of `extract_content` — confirm against the feed parser.
    pub rss_content: Option<String>,
}
19
20// === Article content extraction (free functions) ===
21
22/// Fetch article using specified extraction method, returning just the text content.
23pub async fn extract_content(
24    client: &Client,
25    article_url: &str,
26    method: &ExtractorMethod,
27    width: usize,
28    rss_content: Option<&str>,
29) -> Result<String> {
30    match method {
31        ExtractorMethod::Readability => match fetch_and_extract(client, article_url, width).await {
32            Ok((_title, text)) => Ok(text),
33            Err(e) => match rss_content.filter(|c| !c.is_empty()) {
34                Some(content) => Ok(html_to_text(content, width)),
35                None => Err(e),
36            },
37        },
38        ExtractorMethod::RssContent => {
39            if let Some(html) = rss_content {
40                if !html.is_empty() {
41                    return Ok(html_to_text(html, width));
42                }
43            }
44            let (_title, text) = fetch_and_extract(client, article_url, width).await?;
45            Ok(text)
46        }
47    }
48}
49
50/// Fetch a URL and extract the article content. Returns (title, text).
51pub async fn fetch_and_extract(
52    client: &Client,
53    url: &str,
54    width: usize,
55) -> Result<(String, String)> {
56    let response = client
57        .get(url)
58        .header("User-Agent", "feed-cli/0.1")
59        .send()
60        .await
61        .with_context(|| format!("Failed to fetch: {}", url))?;
62
63    let content_type = response
64        .headers()
65        .get("content-type")
66        .and_then(|v| v.to_str().ok())
67        .map(String::from);
68
69    let bytes = response
70        .bytes()
71        .await
72        .with_context(|| format!("Failed to read response from: {}", url))?;
73
74    let html = decode_html_bytes(&bytes, content_type.as_deref());
75
76    Ok(extract_from_html(&html, width))
77}
78
79/// Extract readable text from raw HTML string. Returns (title, text).
80pub fn extract_from_html(html: &str, width: usize) -> (String, String) {
81    match readability::Readability::new(html, None) {
82        Ok(mut r) => match r.parse() {
83            Some(article) => {
84                let content = article.content.unwrap_or_default();
85                let text = html2text::from_read(content.as_bytes(), width).unwrap_or_default();
86                (article.title.unwrap_or_default(), text)
87            }
88            None => (String::new(), html_to_text(html, width)),
89        },
90        Err(_) => (String::new(), html_to_text(html, width)),
91    }
92}
93
94/// Render HTML (or plain text) to wrapped text without Readability extraction.
95pub fn html_to_text(html: &str, width: usize) -> String {
96    html2text::from_read(html.as_bytes(), width).unwrap_or_default()
97}
98
99/// Decode raw bytes to a UTF-8 string, detecting encoding from Content-Type header
100/// and HTML meta tags. Falls back to UTF-8 (lossy).
101pub(crate) fn decode_html_bytes(bytes: &[u8], content_type: Option<&str>) -> String {
102    let charset = detect_charset(content_type, bytes);
103
104    if let Some(charset) = charset {
105        if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
106            if encoding != encoding_rs::UTF_8 {
107                let (decoded, _, _) = encoding.decode(bytes);
108                return decoded.into_owned();
109            }
110        }
111    }
112
113    String::from_utf8_lossy(bytes).into_owned()
114}
115
/// Detect the declared charset of an HTML response.
///
/// Lookup order:
/// 1. The `charset` parameter of the `Content-Type` header value, e.g.
///    `text/html; charset=shift_jis` (key matched case-insensitively, surrounding
///    double quotes removed). An empty value is ignored.
/// 2. The first `charset=<value>` declaration found in the first 4096 bytes of the body,
///    which covers both `<meta charset="...">` and
///    `<meta http-equiv="Content-Type" content="...; charset=...">`.
///
/// Returns the label exactly as written (original case preserved), or `None` when no
/// non-empty declaration is found. The label is not validated here.
fn detect_charset(content_type: Option<&str>, bytes: &[u8]) -> Option<String> {
    const KEY: &str = "charset";
    const SNIFF_LEN: usize = 4096;

    // Try the Content-Type header first; parameters follow the media type after ';'.
    if let Some(ct) = content_type {
        let from_header = ct.split(';').skip(1).find_map(|param| {
            let (key, value) = param.split_once('=')?;
            let value = value.trim().trim_matches('"');
            // An empty value carries no information; keep looking (and ultimately fall
            // through to the document body) instead of returning an unusable label.
            (key.trim().eq_ignore_ascii_case(KEY) && !value.is_empty()).then_some(value)
        });
        if let Some(charset) = from_header {
            return Some(charset.to_string());
        }
    }

    // Fall back to declarations inside the first few KB of the document.
    let head = &bytes[..bytes.len().min(SNIFF_LEN)];
    let lossy = String::from_utf8_lossy(head);
    // ASCII-only lowercasing keeps byte offsets identical between `lower` and `lossy`,
    // so positions found in one can be used to slice the other.
    let lower = lossy.to_ascii_lowercase();

    // Examine every occurrence, not just the first: the word may appear in ordinary text
    // or as part of another attribute name before the real declaration.
    lower.match_indices(KEY).find_map(|(pos, _)| {
        // Skip names that merely end in "charset", such as `accept-charset`.
        let part_of_longer_name = lower.as_bytes()[..pos]
            .last()
            .map_or(false, |b| b.is_ascii_alphanumeric() || matches!(*b, b'-' | b'_'));
        if part_of_longer_name {
            return None;
        }

        // A declaration must be followed by '=' (optionally surrounded by whitespace).
        let after_key = lossy[pos + KEY.len()..].trim_start();
        let value = after_key.strip_prefix('=')?.trim_start();

        let charset: String = value
            .trim_start_matches(['"', '\''])
            .chars()
            // Stop at the closing quote, the end of the attribute/tag, or any whitespace.
            .take_while(|c| !c.is_ascii_whitespace() && !matches!(c, '"' | '\'' | ';' | '>' | '/'))
            .collect();
        (!charset.is_empty()).then_some(charset)
    })
}
146
#[cfg(test)]
mod tests {
    use super::*;

    // --- detect_charset: header parsing and in-document sniffing ---

    /// A plain `charset=` parameter in the header is returned as written.
    #[test]
    fn test_detect_charset_from_content_type() {
        assert_eq!(
            detect_charset(Some("text/html; charset=utf-8"), b"").as_deref(),
            Some("utf-8")
        );
    }

    /// The parameter name is matched case-insensitively; the value keeps its case.
    #[test]
    fn test_detect_charset_case_insensitive_key() {
        assert_eq!(
            detect_charset(Some("text/html; Charset=UTF-8"), b"").as_deref(),
            Some("UTF-8")
        );
    }

    /// Double quotes around the header value are stripped.
    #[test]
    fn test_detect_charset_quoted_value() {
        assert_eq!(
            detect_charset(Some("text/html; charset=\"shift_jis\""), b"").as_deref(),
            Some("shift_jis")
        );
    }

    /// Without a header, `<meta charset="...">` is picked up from the body.
    #[test]
    fn test_detect_charset_from_meta_tag() {
        let body = br#"<html><head><meta charset="euc-jp"></head></html>"#;
        assert_eq!(detect_charset(None, body).as_deref(), Some("euc-jp"));
    }

    /// The `http-equiv` form of the meta declaration is recognised too.
    #[test]
    fn test_detect_charset_from_meta_http_equiv() {
        let body = br#"<html><head><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"></head></html>"#;
        assert_eq!(detect_charset(None, body).as_deref(), Some("iso-8859-1"));
    }

    /// No header and no declaration in the body means no charset.
    #[test]
    fn test_detect_charset_none_when_absent() {
        assert!(detect_charset(None, b"<html><body>hello world</body></html>").is_none());
    }

    /// When both sources declare a charset, the header wins.
    #[test]
    fn test_detect_charset_content_type_takes_priority() {
        let body = br#"<html><head><meta charset="euc-jp"></head></html>"#;
        assert_eq!(
            detect_charset(Some("text/html; charset=shift_jis"), body).as_deref(),
            Some("shift_jis")
        );
    }

    /// An empty header value and a body without a declaration yield nothing.
    #[test]
    fn test_detect_charset_empty_content_type() {
        assert!(detect_charset(Some(""), b"<html></html>").is_none());
    }

    /// A bare `charset` parameter without `=` is not a declaration.
    #[test]
    fn test_detect_charset_no_equals() {
        assert!(detect_charset(Some("text/html; charset"), b"").is_none());
    }

    // --- decode_html_bytes: decoding with detected encodings ---

    /// With no charset information, ASCII passes through unchanged.
    #[test]
    fn test_decode_utf8_default() {
        let decoded = decode_html_bytes("Hello, world!".as_bytes(), None);
        assert_eq!(decoded, "Hello, world!");
    }

    /// Explicitly declared UTF-8 round-trips multi-byte text.
    #[test]
    fn test_decode_utf8_japanese() {
        let decoded = decode_html_bytes("こんにちは".as_bytes(), Some("text/html; charset=utf-8"));
        assert_eq!(decoded, "こんにちは");
    }

    /// Shift_JIS bytes are decoded when the header declares shift_jis.
    #[test]
    fn test_decode_shift_jis() {
        let encoded = encoding_rs::SHIFT_JIS.encode("テスト").0;
        let decoded = decode_html_bytes(&encoded, Some("text/html; charset=shift_jis"));
        assert_eq!(decoded, "テスト");
    }

    /// Bytes produced by the WINDOWS_1252 encoder decode correctly under the
    /// `iso-8859-1` label.
    #[test]
    fn test_decode_iso_8859_1() {
        let encoded = encoding_rs::WINDOWS_1252.encode("café").0;
        let decoded = decode_html_bytes(&encoded, Some("text/html; charset=iso-8859-1"));
        assert_eq!(decoded, "café");
    }

    /// EUC-JP bytes are decoded when the header declares euc-jp.
    #[test]
    fn test_decode_euc_jp() {
        let encoded = encoding_rs::EUC_JP.encode("日本語").0;
        let decoded = decode_html_bytes(&encoded, Some("text/html; charset=euc-jp"));
        assert_eq!(decoded, "日本語");
    }

    /// With no header, the charset declared in the document's meta tag drives decoding.
    #[test]
    fn test_decode_charset_from_meta_tag() {
        let document = "<html><head><meta charset=\"shift_jis\"></head><body>テスト</body></html>";
        let encoded = encoding_rs::SHIFT_JIS.encode(document).0;
        let decoded = decode_html_bytes(&encoded, None);
        assert!(decoded.contains("テスト"));
    }

    /// No charset anywhere: the bytes are read as (lossy) UTF-8.
    #[test]
    fn test_decode_no_charset_lossy_utf8() {
        let decoded = decode_html_bytes(b"plain ascii", None);
        assert_eq!(decoded, "plain ascii");
    }

    /// An unrecognised encoding label falls through to lossy UTF-8 decoding.
    #[test]
    fn test_decode_unknown_encoding_label() {
        let decoded = decode_html_bytes(b"hello", Some("text/html; charset=bogus-encoding"));
        assert_eq!(decoded, "hello");
    }
}
feed/article.rs

feed/
article.rs