Skip to main content

stillo_core/extractor/
readability.rs

1use markup5ever_rcdom::{Handle, NodeData};
2use url::Url;
3use std::collections::HashMap;
4use std::rc::Rc;
5use crate::document::{ExtractedContent, ExtractedLink, PageMetadata};
6
/// Element names whose entire subtree is treated as boilerplate by `is_noise`.
const NOISE_TAGS: &[&str] = &["nav", "header", "footer", "aside", "script", "style", "noscript", "iframe", "form"];
/// Lowercase fragments that mark a `class`/`id` value as boilerplate.
///
/// NOTE(review): these are matched with `str::contains`, so a short entry such as
/// "ad" also hits values like "loading", "shadow" or "thread" — confirm that this
/// breadth is intended.
const NOISE_CLASS_PATTERNS: &[&str] = &["nav", "sidebar", "menu", "ad", "banner", "comment", "footer", "header", "widget"];
/// Lowercase fragments that earn a `class`/`id` value a bonus in `class_score`
/// (also matched as substrings).
const CONTENT_CLASS_PATTERNS: &[&str] = &["article", "content", "main", "post", "entry", "body", "text"];
10
/// Readability-style extractor that pulls the main content out of a parsed HTML tree.
pub struct ReadabilityExtractor {
    /// When `true`, `<a>` elements are kept in the serialized HTML and each anchor
    /// with a resolvable `href` and non-empty text is collected as an `ExtractedLink`.
    /// When `false`, anchors are flattened like any other inline element and no
    /// links are collected.
    pub preserve_links: bool,
}
14
15impl ReadabilityExtractor {
16    pub fn extract(&self, root: &Handle, base_url: &Url) -> ExtractedContent {
17        let title = extract_title(root);
18        let metadata = extract_metadata(root, base_url);
19        let body = find_body(root);
20
21        let main_node = body.as_ref()
22            .and_then(|b| find_main_content(b))
23            .or(body.clone());
24
25        let (mh, mt, ml) = main_node
26            .as_ref()
27            .map(|n| self.serialize_content(n, base_url))
28            .unwrap_or_else(|| (String::new(), String::new(), Vec::new()));
29
30        // 選択されたノードのコンテンツが極端に少ない場合は body 全体を試みる。
31        // ニュース一覧など「全てがリンク」な構造ではスコアリングが個別カードを選びがちなため。
32        let (body_html, body_text, links) = if mt.trim().len() < 200 {
33            if let Some(b) = body.as_ref() {
34                let (bh, bt, bl) = self.serialize_content(b, base_url);
35                if bt.trim().len() > mt.trim().len() {
36                    (bh, bt, bl)
37                } else {
38                    (mh, mt, ml)
39                }
40            } else {
41                (mh, mt, ml)
42            }
43        } else {
44            (mh, mt, ml)
45        };
46
47        ExtractedContent {
48            url: base_url.clone(),
49            title: title.unwrap_or_else(|| base_url.to_string()),
50            byline: metadata.og_title.clone(),
51            body_text,
52            body_html,
53            links,
54            metadata,
55        }
56    }
57
58    fn serialize_content(&self, handle: &Handle, base_url: &Url) -> (String, String, Vec<ExtractedLink>) {
59        let mut html = String::new();
60        let mut text = String::new();
61        let mut links = Vec::new();
62        serialize_node(handle, &mut html, &mut text, &mut links, base_url, self.preserve_links);
63        (html, text, links)
64    }
65}
66
/// Returns the first `<body>` element at or below `root` (depth-first, document
/// order), or `None` when there is none.
fn find_body(root: &Handle) -> Option<Handle> {
    find_tag(root, "body")
}
70
71fn find_tag(handle: &Handle, tag_name: &str) -> Option<Handle> {
72    if let NodeData::Element { name, .. } = &handle.data {
73        if name.local.as_ref() == tag_name {
74            return Some(handle.clone());
75        }
76    }
77    for child in handle.children.borrow().iter() {
78        if let Some(found) = find_tag(child, tag_name) {
79            return Some(found);
80        }
81    }
82    None
83}
84
85fn find_main_content(body: &Handle) -> Option<Handle> {
86    // <main>, <article> を優先
87    if let Some(node) = find_tag(body, "main").or_else(|| find_tag(body, "article")) {
88        return Some(node);
89    }
90
91    // Readability.js 方式: リーフノードのスコアを祖先コンテナへ伝播し、
92    // コンテンツが豊富な大きなブロックが選ばれるようにする。
93    let mut candidates: HashMap<usize, (Handle, f64)> = HashMap::new();
94    let mut ancestors: Vec<Handle> = Vec::new();
95    collect_candidate_scores(body, &mut ancestors, &mut candidates);
96
97    candidates
98        .into_values()
99        .map(|(h, raw)| {
100            let text_len = count_text(&h) as f64;
101            let link_len = count_link_text(&h) as f64;
102            let density = if text_len > 0.0 { link_len / text_len } else { 1.0 };
103            let bonus = class_score(&h);
104            let score = (raw + bonus) * (1.0 - density);
105            (h, score)
106        })
107        .filter(|(_, s)| *s > 0.0)
108        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
109        .map(|(node, _)| node)
110}
111
112/// p/pre/blockquote/td/li などコンテンツ信号になるノードを起点に、
113/// 祖先コンテナへ最大 4 段階・重みを半減しながらスコアを伝播する。
114fn collect_candidate_scores(
115    handle: &Handle,
116    ancestors: &mut Vec<Handle>,
117    candidates: &mut HashMap<usize, (Handle, f64)>,
118) {
119    if is_noise(handle) {
120        return;
121    }
122
123    if let NodeData::Element { name, .. } = &handle.data {
124        let tag = name.local.as_ref();
125        let score = leaf_content_score(handle, tag);
126
127        if score > 0.0 {
128            let mut weight = 1.0;
129            let mut levels = 0usize;
130            for ancestor in ancestors.iter().rev() {
131                if let NodeData::Element { name: aname, .. } = &ancestor.data {
132                    if is_candidate_tag(aname.local.as_ref()) {
133                        let key = Rc::as_ptr(ancestor) as usize;
134                        candidates
135                            .entry(key)
136                            .or_insert_with(|| (ancestor.clone(), 0.0))
137                            .1 += score * weight;
138                        weight *= 0.5;
139                        levels += 1;
140                        if levels >= 4 {
141                            break;
142                        }
143                    }
144                }
145            }
146        }
147    }
148
149    ancestors.push(handle.clone());
150    for child in handle.children.borrow().iter() {
151        collect_candidate_scores(child, ancestors, candidates);
152    }
153    ancestors.pop();
154}
155
156/// コンテンツ信号となるリーフノードの基礎スコア
157fn leaf_content_score(handle: &Handle, tag: &str) -> f64 {
158    let text_len = count_text(handle) as f64;
159    if text_len < 20.0 {
160        return 0.0;
161    }
162    match tag {
163        "p" => 1.0 + (text_len / 100.0).min(3.0),
164        "pre" | "blockquote" => 3.0 + (text_len / 100.0).min(3.0),
165        "td" => (text_len / 50.0).min(3.0),
166        "li" => 0.5 + (text_len / 200.0).min(1.0),
167        _ => 0.0,
168    }
169}
170
/// Tells whether `tag` names a container that may receive propagated scores.
fn is_candidate_tag(tag: &str) -> bool {
    const CONTAINER_TAGS: [&str; 9] = [
        "div",
        "section",
        "article",
        "main",
        "blockquote",
        "pre",
        "td",
        "tbody",
        "p",
    ];
    CONTAINER_TAGS.iter().any(|known| *known == tag)
}
175
176fn class_score(handle: &Handle) -> f64 {
177    let attrs = match &handle.data {
178        NodeData::Element { attrs, .. } => attrs.borrow(),
179        _ => return 0.0,
180    };
181
182    let mut score = 0.0;
183    for attr in attrs.iter() {
184        let name = attr.name.local.as_ref();
185        if name != "class" && name != "id" {
186            continue;
187        }
188        let val = attr.value.as_ref().to_lowercase();
189        for pattern in CONTENT_CLASS_PATTERNS {
190            if val.contains(pattern) {
191                score += 10.0;
192            }
193        }
194        for pattern in NOISE_CLASS_PATTERNS {
195            if val.contains(pattern) {
196                score -= 10.0;
197            }
198        }
199    }
200    score
201}
202
203fn is_noise(handle: &Handle) -> bool {
204    match &handle.data {
205        NodeData::Element { name, attrs, .. } => {
206            let tag = name.local.as_ref();
207            if NOISE_TAGS.contains(&tag) {
208                return true;
209            }
210            let attrs = attrs.borrow();
211            for attr in attrs.iter() {
212                let aname = attr.name.local.as_ref();
213                if aname != "class" && aname != "id" {
214                    continue;
215                }
216                let val = attr.value.as_ref().to_lowercase();
217                for pattern in NOISE_CLASS_PATTERNS {
218                    if val.contains(pattern) {
219                        return true;
220                    }
221                }
222            }
223            false
224        }
225        _ => false,
226    }
227}
228
229fn count_text(handle: &Handle) -> usize {
230    let mut total = 0;
231    count_text_inner(handle, &mut total);
232    total
233}
234
235fn count_text_inner(handle: &Handle, total: &mut usize) {
236    match &handle.data {
237        NodeData::Text { contents } => {
238            *total += contents.borrow().trim().len();
239        }
240        NodeData::Element { name, .. } => {
241            let tag = name.local.as_ref();
242            if tag == "script" || tag == "style" {
243                return;
244            }
245            for child in handle.children.borrow().iter() {
246                count_text_inner(child, total);
247            }
248        }
249        _ => {
250            for child in handle.children.borrow().iter() {
251                count_text_inner(child, total);
252            }
253        }
254    }
255}
256
257fn count_link_text(handle: &Handle) -> usize {
258    let mut total = 0;
259    count_link_text_inner(handle, &mut total, false);
260    total
261}
262
263fn count_link_text_inner(handle: &Handle, total: &mut usize, in_link: bool) {
264    match &handle.data {
265        NodeData::Text { contents } if in_link => {
266            *total += contents.borrow().trim().len();
267        }
268        NodeData::Element { name, .. } => {
269            let tag = name.local.as_ref();
270            let is_link = tag == "a";
271            for child in handle.children.borrow().iter() {
272                count_link_text_inner(child, total, in_link || is_link);
273            }
274        }
275        _ => {}
276    }
277}
278
279fn serialize_node(
280    handle: &Handle,
281    html: &mut String,
282    text: &mut String,
283    links: &mut Vec<ExtractedLink>,
284    base_url: &Url,
285    preserve_links: bool,
286) {
287    if is_noise(handle) {
288        return;
289    }
290
291    match &handle.data {
292        NodeData::Text { contents } => {
293            let t = contents.borrow();
294            let trimmed = t.as_ref();
295            if !trimmed.trim().is_empty() {
296                html.push_str(&html_escape(trimmed));
297                text.push_str(trimmed);
298            }
299        }
300        NodeData::Element { name, attrs, .. } => {
301            let tag = name.local.as_ref();
302            let attrs_ref = attrs.borrow();
303
304            match tag {
305                "script" | "style" | "noscript" | "iframe" => return,
306                "a" if preserve_links => {
307                    let href = attrs_ref.iter()
308                        .find(|a| a.name.local.as_ref() == "href")
309                        .map(|a| a.value.as_ref().to_owned());
310                    let rel = attrs_ref.iter()
311                        .find(|a| a.name.local.as_ref() == "rel")
312                        .map(|a| a.value.as_ref().to_owned());
313
314                    let resolved = href.as_deref().and_then(|h| base_url.join(h).ok());
315
316                    html.push_str("<a");
317                    if let Some(ref h) = href {
318                        html.push_str(&format!(" href=\"{}\"", html_escape(h)));
319                    }
320                    html.push('>');
321
322                    let mut link_text = String::new();
323                    let mut link_html = String::new();
324                    for child in handle.children.borrow().iter() {
325                        serialize_node(child, &mut link_html, text, links, base_url, preserve_links);
326                        collect_text(child, &mut link_text);
327                    }
328                    html.push_str(&link_html);
329                    html.push_str("</a>");
330
331                    if let Some(href_url) = resolved {
332                        let trimmed = link_text.trim().to_owned();
333                        // 画像リンク等でアンカーテキストが空のものは除外
334                        if !trimmed.is_empty() {
335                            links.push(ExtractedLink {
336                                text: trimmed,
337                                href: href_url,
338                                rel,
339                            });
340                        }
341                    }
342                    return;
343                }
344                _ => {
345                    // ブロック要素
346                    let is_block = matches!(tag, "p" | "div" | "section" | "article" |
347                        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" |
348                        "ul" | "ol" | "li" | "blockquote" | "pre" | "br" | "hr" |
349                        "table" | "tr" | "td" | "th" | "thead" | "tbody");
350
351                    if is_block {
352                        html.push('<');
353                        html.push_str(tag);
354                        html.push('>');
355                        if tag == "br" || tag == "hr" {
356                            // self-closing
357                        } else {
358                            for child in handle.children.borrow().iter() {
359                                serialize_node(child, html, text, links, base_url, preserve_links);
360                            }
361                            html.push_str("</");
362                            html.push_str(tag);
363                            html.push('>');
364                        }
365                    } else {
366                        // インライン要素はそのまま子を出力
367                        for child in handle.children.borrow().iter() {
368                            serialize_node(child, html, text, links, base_url, preserve_links);
369                        }
370                    }
371                    return;
372                }
373            }
374        }
375        _ => {}
376    }
377}
378
379fn collect_text(handle: &Handle, out: &mut String) {
380    match &handle.data {
381        NodeData::Text { contents } => {
382            out.push_str(contents.borrow().as_ref());
383        }
384        _ => {
385            for child in handle.children.borrow().iter() {
386                collect_text(child, out);
387            }
388        }
389    }
390}
391
/// Escapes `&`, `<`, `>` and `"` so that `s` can be embedded in HTML text or in
/// a double-quoted attribute value. All other characters are copied unchanged.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
398
399fn extract_title(root: &Handle) -> Option<String> {
400    // <title> タグを優先、次に <h1> を試みる
401    if let Some(title_node) = find_tag(root, "title") {
402        let mut text = String::new();
403        collect_text(&title_node, &mut text);
404        let trimmed = text.trim().to_owned();
405        if !trimmed.is_empty() {
406            return Some(trimmed);
407        }
408    }
409    if let Some(h1) = find_tag(root, "h1") {
410        let mut text = String::new();
411        collect_text(&h1, &mut text);
412        let trimmed = text.trim().to_owned();
413        if !trimmed.is_empty() {
414            return Some(trimmed);
415        }
416    }
417    None
418}
419
420fn extract_metadata(root: &Handle, base_url: &Url) -> PageMetadata {
421    let mut meta = PageMetadata {
422        description: None,
423        og_title: None,
424        og_image: None,
425        canonical: None,
426        published_at: None,
427    };
428    collect_meta(root, &mut meta, base_url);
429    meta
430}
431
432fn collect_meta(handle: &Handle, meta: &mut PageMetadata, base_url: &Url) {
433    if let NodeData::Element { name, attrs, .. } = &handle.data {
434        let tag = name.local.as_ref();
435        let attrs_ref = attrs.borrow();
436
437        if tag == "meta" {
438            let name_attr = attrs_ref.iter()
439                .find(|a| a.name.local.as_ref() == "name")
440                .map(|a| a.value.as_ref().to_lowercase());
441            let property_attr = attrs_ref.iter()
442                .find(|a| a.name.local.as_ref() == "property")
443                .map(|a| a.value.as_ref().to_lowercase());
444            let content = attrs_ref.iter()
445                .find(|a| a.name.local.as_ref() == "content")
446                .map(|a| a.value.as_ref().to_owned());
447
448            match (name_attr.as_deref(), property_attr.as_deref(), content) {
449                (Some("description"), _, Some(c)) => meta.description = Some(c),
450                (_, Some("og:title"), Some(c)) => meta.og_title = Some(c),
451                (_, Some("og:image"), Some(c)) => meta.og_image = Some(c),
452                _ => {}
453            }
454        } else if tag == "link" {
455            let is_canonical = attrs_ref.iter()
456                .any(|a| a.name.local.as_ref() == "rel" && a.value.as_ref() == "canonical");
457            if is_canonical {
458                if let Some(href) = attrs_ref.iter()
459                    .find(|a| a.name.local.as_ref() == "href")
460                    .and_then(|a| base_url.join(a.value.as_ref()).ok())
461                {
462                    meta.canonical = Some(href);
463                }
464            }
465        }
466    }
467
468    for child in handle.children.borrow().iter() {
469        collect_meta(child, meta, base_url);
470    }
471}