Skip to main content

feed/
article.rs

1use anyhow::{Context, Result};
2use chrono::{DateTime, Utc};
3use encoding_rs::Encoding;
4use reqwest::Client;
5
6use crate::config::ExtractorMethod;
7
/// A single feed entry together with the metadata needed to display and fetch it.
#[derive(Debug, Clone)]
pub struct Article {
    /// Title of the article.
    pub title: String,
    /// URL of the article itself.
    /// NOTE(review): presumably the `article_url` handed to `extract_content` — confirm at call sites.
    pub url: String,
    /// Publication timestamp in UTC, or `None` when no timestamp is available.
    pub published: Option<DateTime<Utc>>,
    /// URL of the feed — presumably the feed this article was read from; confirm against the feed loader.
    pub feed_url: String,
    /// Name of the feed — presumably a display name; confirm against the feed loader.
    pub feed_name: String,
    /// Extraction method associated with this article.
    /// NOTE(review): presumably passed as `method` to `extract_content` — confirm at call sites.
    pub extractor: ExtractorMethod,
    /// Read flag — presumably `true` once the user has opened the article; verify where it is set.
    pub read: bool,
    /// Content embedded in the feed entry, if any.
    /// NOTE(review): presumably the `rss_content` argument of `extract_content` — confirm at call sites.
    pub rss_content: Option<String>,
}
19
20// === Article content extraction (free functions) ===
21
22/// Fetch article using specified extraction method, returning just the text content.
23pub async fn extract_content(
24    client: &Client,
25    article_url: &str,
26    method: &ExtractorMethod,
27    width: usize,
28    rss_content: Option<&str>,
29) -> Result<String> {
30    match method {
31        ExtractorMethod::Readability => match fetch_and_extract(client, article_url, width).await {
32            Ok((_title, text)) => Ok(text),
33            Err(e) => match rss_content.filter(|c| !c.is_empty()) {
34                Some(content) => Ok(html_to_text(content, width)),
35                None => Err(e),
36            },
37        },
38        ExtractorMethod::RssContent => {
39            if let Some(html) = rss_content {
40                if !html.is_empty() {
41                    return Ok(html_to_text(html, width));
42                }
43            }
44            let (_title, text) = fetch_and_extract(client, article_url, width).await?;
45            Ok(text)
46        }
47    }
48}
49
50/// Fetch a URL and extract the article content. Returns (title, text).
51pub async fn fetch_and_extract(
52    client: &Client,
53    url: &str,
54    width: usize,
55) -> Result<(String, String)> {
56    let response = client
57        .get(url)
58        .header("User-Agent", "feed-cli/0.1")
59        .send()
60        .await
61        .with_context(|| format!("Failed to fetch: {}", url))?;
62
63    let content_type = response
64        .headers()
65        .get("content-type")
66        .and_then(|v| v.to_str().ok())
67        .map(String::from);
68
69    let bytes = response
70        .bytes()
71        .await
72        .with_context(|| format!("Failed to read response from: {}", url))?;
73
74    let html = decode_html_bytes(&bytes, content_type.as_deref());
75
76    Ok(extract_from_html(&html, width))
77}
78
79/// Extract readable text from raw HTML string. Returns (title, text).
80pub fn extract_from_html(html: &str, width: usize) -> (String, String) {
81    match readability::Readability::new(html, None) {
82        Ok(mut r) => match r.parse() {
83            Some(article) => {
84                let content = article.content.unwrap_or_default();
85                let text = html2text::from_read(content.as_bytes(), width).unwrap_or_default();
86                (article.title.unwrap_or_default(), text)
87            }
88            None => (String::new(), html_to_text(html, width)),
89        },
90        Err(_) => (String::new(), html_to_text(html, width)),
91    }
92}
93
94/// Render HTML (or plain text) to wrapped text without Readability extraction.
95pub fn html_to_text(html: &str, width: usize) -> String {
96    html2text::from_read(html.as_bytes(), width).unwrap_or_default()
97}
98
99/// Decode raw bytes to a UTF-8 string, detecting encoding from Content-Type header
100/// and HTML meta tags. Falls back to UTF-8 (lossy).
101pub(crate) fn decode_html_bytes(bytes: &[u8], content_type: Option<&str>) -> String {
102    let charset = detect_charset(content_type, bytes);
103
104    if let Some(charset) = charset {
105        if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
106            if encoding != encoding_rs::UTF_8 {
107                let (decoded, _, _) = encoding.decode(bytes);
108                return decoded.into_owned();
109            }
110        }
111    }
112
113    String::from_utf8_lossy(bytes).into_owned()
114}
115
/// Detect the declared charset of an HTML response.
///
/// The `charset` parameter of the Content-Type header (for example
/// `text/html; charset=shift_jis`) wins when it is present and non-empty.
/// Otherwise the first 4096 bytes of the body are scanned for a
/// `charset=<value>` declaration, which covers both `<meta charset="...">` and
/// `<meta http-equiv="Content-Type" content="...; charset=...">`.
///
/// Returns the label exactly as written (case preserved, quotes removed), or
/// `None` when nothing usable is declared. The label is not validated here.
fn detect_charset(content_type: Option<&str>, bytes: &[u8]) -> Option<String> {
    const KEY: &str = "charset";

    // Header first. An empty `charset=` carries no information, so it is
    // skipped and the body still gets a chance.
    let from_header = content_type.and_then(|ct| {
        ct.split(';').skip(1).find_map(|param| {
            let (key, value) = param.split_once('=')?;
            let value = value.trim().trim_matches('"');
            (key.trim().eq_ignore_ascii_case(KEY) && !value.is_empty()).then_some(value)
        })
    });
    if let Some(charset) = from_header {
        return Some(charset.to_string());
    }

    // Fall back to a declaration in the first few KB of the document.
    let head = &bytes[..bytes.len().min(4096)];
    let lossy = String::from_utf8_lossy(head);
    // ASCII lowercasing keeps byte offsets identical to `lossy`, so positions
    // found in `lower` can be used to slice the original-case text.
    let lower = lossy.to_ascii_lowercase();

    // Check every occurrence: the word may appear in comments, prose or other
    // attributes before the real declaration. Only `charset` followed by `=`
    // counts.
    lower.match_indices(KEY).find_map(|(pos, _)| {
        let rest = lossy[pos + KEY.len()..].trim_start();
        let rest = rest.strip_prefix('=')?.trim_start();
        let charset: String = rest
            .trim_start_matches(['"', '\''])
            .chars()
            // Stop at anything that cannot be part of an encoding label,
            // including the `/` of an unquoted `<meta charset=utf-8/>`.
            .take_while(|c| !(c.is_whitespace() || matches!(c, '"' | '\'' | ';' | '>' | '/')))
            .collect();
        (!charset.is_empty()).then_some(charset)
    })
}
146
// Unit tests for charset detection (`detect_charset`) and byte decoding
// (`decode_html_bytes`). All inputs are in-memory; nothing here touches the network.
#[cfg(test)]
mod tests {
    use super::*;

    // --- detect_charset tests ---

    // charset is extracted from the Content-Type header value.
    #[test]
    fn test_detect_charset_from_content_type() {
        let result = detect_charset(Some("text/html; charset=utf-8"), b"");
        assert_eq!(result, Some("utf-8".to_string()));
    }

    // "Charset" (uppercase C) is treated the same as "charset".
    // The value itself is returned with its original case.
    #[test]
    fn test_detect_charset_case_insensitive_key() {
        let result = detect_charset(Some("text/html; Charset=UTF-8"), b"");
        assert_eq!(result, Some("UTF-8".to_string()));
    }

    // Quotes around the charset value are stripped (e.g. charset="shift_jis").
    #[test]
    fn test_detect_charset_quoted_value() {
        let result = detect_charset(Some("text/html; charset=\"shift_jis\""), b"");
        assert_eq!(result, Some("shift_jis".to_string()));
    }

    // charset is detected from an HTML <meta charset="..."> tag when no header is present.
    #[test]
    fn test_detect_charset_from_meta_tag() {
        let html = br#"<html><head><meta charset="euc-jp"></head></html>"#;
        let result = detect_charset(None, html);
        assert_eq!(result, Some("euc-jp".to_string()));
    }

    // charset is detected from a <meta http-equiv="Content-Type"> tag.
    #[test]
    fn test_detect_charset_from_meta_http_equiv() {
        let html = br#"<html><head><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"></head></html>"#;
        let result = detect_charset(None, html);
        assert_eq!(result, Some("iso-8859-1".to_string()));
    }

    // None is returned when neither header nor meta tag specifies a charset.
    #[test]
    fn test_detect_charset_none_when_absent() {
        let result = detect_charset(None, b"<html><body>hello world</body></html>");
        assert_eq!(result, None);
    }

    // Content-Type header wins over a conflicting <meta charset> in the HTML body.
    #[test]
    fn test_detect_charset_content_type_takes_priority() {
        let html = br#"<html><head><meta charset="euc-jp"></head></html>"#;
        let result = detect_charset(Some("text/html; charset=shift_jis"), html);
        assert_eq!(result, Some("shift_jis".to_string()));
    }

    // --- decode_html_bytes tests ---

    // UTF-8 Japanese text passes through unchanged when charset=utf-8 is specified.
    #[test]
    fn test_decode_utf8_japanese() {
        let result = decode_html_bytes("こんにちは".as_bytes(), Some("text/html; charset=utf-8"));
        assert_eq!(result, "こんにちは");
    }

    // Shift_JIS encoded bytes are correctly decoded to UTF-8.
    #[test]
    fn test_decode_shift_jis() {
        let (bytes, _, _) = encoding_rs::SHIFT_JIS.encode("テスト");
        let result = decode_html_bytes(&bytes, Some("text/html; charset=shift_jis"));
        assert_eq!(result, "テスト");
    }

    // ISO-8859-1 (Latin-1) encoded bytes are correctly decoded to UTF-8.
    // The input is produced with the windows-1252 encoder while the header says
    // iso-8859-1; the round-trip asserted below relies on that label decoding
    // compatibly with windows-1252 for this text.
    #[test]
    fn test_decode_iso_8859_1() {
        let (bytes, _, _) = encoding_rs::WINDOWS_1252.encode("café");
        let result = decode_html_bytes(&bytes, Some("text/html; charset=iso-8859-1"));
        assert_eq!(result, "café");
    }

    // EUC-JP encoded bytes are correctly decoded to UTF-8.
    #[test]
    fn test_decode_euc_jp() {
        let (bytes, _, _) = encoding_rs::EUC_JP.encode("日本語");
        let result = decode_html_bytes(&bytes, Some("text/html; charset=euc-jp"));
        assert_eq!(result, "日本語");
    }

    // Charset is auto-detected from <meta charset> when no Content-Type header is given.
    // The whole document, including the meta tag, is Shift_JIS encoded.
    #[test]
    fn test_decode_charset_from_meta_tag() {
        let html = "<html><head><meta charset=\"shift_jis\"></head><body>テスト</body></html>";
        let (bytes, _, _) = encoding_rs::SHIFT_JIS.encode(html);
        let result = decode_html_bytes(&bytes, None);
        assert!(result.contains("テスト"));
    }

    // Bytes without any charset hint fall back to UTF-8 (lossy).
    #[test]
    fn test_decode_no_charset_lossy_utf8() {
        let result = decode_html_bytes(b"plain ascii", None);
        assert_eq!(result, "plain ascii");
    }

    // An unrecognized encoding label falls through to UTF-8 lossy instead of failing.
    #[test]
    fn test_decode_unknown_encoding_label() {
        let result = decode_html_bytes(b"hello", Some("text/html; charset=bogus-encoding"));
        assert_eq!(result, "hello");
    }
}