Skip to main content

stillo_core/
extractor.rs

1pub mod readability;
2pub mod spa_detection;
3
4use html5ever::parse_document;
5use html5ever::tendril::TendrilSink;
6use markup5ever_rcdom::{NodeData, RcDom};
7use url::Url;
8
9use crate::document::{ExtractedContent, RawHtml, SpaDetection};
10use self::readability::ReadabilityExtractor;
11use self::spa_detection::{detect_spa, extract_text_length};
12
/// Tunable settings for [`ContentExtractor`].
///
/// `PartialEq`/`Eq` are derived so that configurations can be compared
/// directly (for example in tests or when checking for a changed setting).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractorConfig {
    /// Threshold handed to `detect_spa` together with the page's text length.
    /// NOTE(review): presumably the minimum text length for a page to count as
    /// having real content — confirm against `spa_detection`.
    pub min_content_length: usize,
    /// NOTE(review): not read anywhere in the visible part of this file;
    /// presumably selectors for elements to strip before extraction — confirm.
    pub noise_selectors: Vec<String>,
    /// Forwarded to `ReadabilityExtractor::preserve_links`.
    pub preserve_links: bool,
}
19
20impl Default for ExtractorConfig {
21    fn default() -> Self {
22        Self {
23            min_content_length: 500,
24            noise_selectors: vec![],
25            preserve_links: true,
26        }
27    }
28}
29
30#[derive(Debug, thiserror::Error)]
31pub enum ExtractionError {
32    #[error("Failed to decode HTML: {0}")]
33    Decode(String),
34    #[error("No content found")]
35    NoContent,
36}
37
38pub struct ContentExtractor {
39    config: ExtractorConfig,
40}
41
42impl ContentExtractor {
43    pub fn new(config: ExtractorConfig) -> Self {
44        Self { config }
45    }
46
47    /// RawHtml → ExtractedContent(純粋関数)
48    pub fn extract(&self, raw: &RawHtml) -> Result<ExtractedContent, ExtractionError> {
49        let html_str = decode_bytes(raw);
50        let dom = parse_html(&html_str);
51        let root = dom.document.clone();
52
53        let text_len = extract_text_length(&root);
54        let _spa = detect_spa(&root, text_len, self.config.min_content_length);
55
56        let extractor = ReadabilityExtractor {
57            preserve_links: self.config.preserve_links,
58        };
59        let content = extractor.extract(&root, &raw.url);
60
61        Ok(content)
62    }
63
64    pub fn detect_spa_for(&self, raw: &RawHtml) -> Result<SpaDetection, ExtractionError> {
65        let html_str = decode_bytes(raw);
66        let dom = parse_html(&html_str);
67        let root = dom.document.clone();
68        let text_len = extract_text_length(&root);
69        Ok(detect_spa(&root, text_len, self.config.min_content_length))
70    }
71
72    /// frameset ページのフレーム URL 一覧を返す。空なら通常ページ。
73    pub fn detect_frames(&self, raw: &RawHtml) -> Vec<Url> {
74        let html_str = decode_bytes(raw);
75        let dom = parse_html(&html_str);
76        collect_frame_srcs(&dom.document, &raw.url)
77    }
78}
79
80/// HTTP Content-Type ヘッダーと HTML meta charset を参照してエンコードを検出し UTF-8 に変換する。
81/// 判定できない場合は UTF-8 → latin1 の順でフォールバック。
82fn decode_bytes(raw: &RawHtml) -> String {
83    let charset = extract_charset_from_content_type(&raw.content_type)
84        .or_else(|| sniff_charset_from_bytes(&raw.bytes));
85
86    if let Some(label) = charset {
87        if let Some(enc) = encoding_rs::Encoding::for_label(label.as_bytes()) {
88            let (cow, _, _) = enc.decode(&raw.bytes);
89            return cow.into_owned();
90        }
91    }
92
93    if let Ok(s) = std::str::from_utf8(&raw.bytes) {
94        return s.to_owned();
95    }
96
97    // latin1 フォールバック(文字化けは甘受する)
98    raw.bytes.iter().map(|&b| b as char).collect()
99}
100
101fn parse_html(html: &str) -> RcDom {
102    parse_document(RcDom::default(), Default::default())
103        .from_utf8()
104        .read_from(&mut html.as_bytes())
105        .unwrap_or_default()
106}
107
/// Extracts the `charset` parameter from a `Content-Type` header value.
///
/// `"text/html; charset=Shift_JIS"` → `Some("Shift_JIS")`
///
/// The parameter name is matched ASCII case-insensitively, and surrounding
/// whitespace and quotes are stripped from the value. A `charset` parameter
/// whose value is empty is skipped, so the function returns `None` (letting
/// the caller fall back to other detection) rather than `Some("")`.
fn extract_charset_from_content_type(content_type: &str) -> Option<String> {
    const KEY: &str = "charset=";

    content_type
        .split(';')
        .map(str::trim)
        .filter_map(|param| {
            // `get` returns `None` for short segments and for splits that
            // would land inside a multi-byte character, so this cannot panic.
            let key = param.get(..KEY.len())?;
            let value = param.get(KEY.len()..)?;
            key.eq_ignore_ascii_case(KEY).then_some(value)
        })
        .map(|value| {
            value
                .trim()
                .trim_matches(|c: char| c == '"' || c == '\'')
                .trim()
        })
        .find(|value| !value.is_empty())
        .map(str::to_owned)
}
118
/// Scans the first 4096 bytes of an HTML document, as raw bytes, for a
/// `charset=` declaration and returns its value.
///
/// The search is done at the ASCII level, so it also works on documents in
/// encodings such as Shift_JIS where the `<meta>` tag itself is plain ASCII.
/// The match on `charset=` is ASCII case-insensitive; only the first
/// occurrence is considered, and whitespace around `=` is not tolerated.
///
/// An optional opening quote is skipped, and the value ends at the first
/// quote, `;`, `>`, `/` or ASCII whitespace. Stopping at `/` keeps a
/// self-closing `<meta charset=utf-8/>` from producing the bogus label
/// `utf-8/`.
///
/// Returns `None` when no declaration is found or its value is empty.
fn sniff_charset_from_bytes(bytes: &[u8]) -> Option<String> {
    const NEEDLE: &[u8] = b"charset=";
    const SCAN_LIMIT: usize = 4096;

    let head = &bytes[..bytes.len().min(SCAN_LIMIT)];
    let value_start = head
        .windows(NEEDLE.len())
        .position(|window| window.eq_ignore_ascii_case(NEEDLE))?
        + NEEDLE.len();

    // Skip a single opening quote, if present.
    let rest = &head[value_start..];
    let rest = match rest.first() {
        Some(b'"' | b'\'') => &rest[1..],
        _ => rest,
    };

    let label: Vec<u8> = rest
        .iter()
        .copied()
        .take_while(|b| !(b.is_ascii_whitespace() || matches!(b, b'"' | b'\'' | b';' | b'>' | b'/')))
        .collect();

    String::from_utf8(label).ok().filter(|s| !s.is_empty())
}
141
142/// DOM 中の <frame src="..."> / <iframe src="..."> の URL を収集する。
143fn collect_frame_srcs(handle: &markup5ever_rcdom::Handle, base: &Url) -> Vec<Url> {
144    let mut result = Vec::new();
145    collect_frame_srcs_inner(handle, base, &mut result);
146    result
147}
148
149fn collect_frame_srcs_inner(
150    handle: &markup5ever_rcdom::Handle,
151    base: &Url,
152    out: &mut Vec<Url>,
153) {
154    if let NodeData::Element { name, attrs, .. } = &handle.data {
155        let tag = name.local.as_ref();
156        if tag == "frame" || tag == "iframe" {
157            if let Some(src) = attrs
158                .borrow()
159                .iter()
160                .find(|a| a.name.local.as_ref() == "src")
161                .map(|a| a.value.as_ref().to_owned())
162            {
163                if let Ok(url) = base.join(&src) {
164                    out.push(url);
165                }
166            }
167        }
168    }
169    for child in handle.children.borrow().iter() {
170        collect_frame_srcs_inner(child, base, out);
171    }
172}