Skip to main content

stillo_core/extractor/
readability.rs

1use markup5ever_rcdom::{Handle, NodeData};
2use url::Url;
3use std::collections::HashMap;
4use std::rc::Rc;
5use crate::document::{ExtractedContent, ExtractedLink, PageMetadata};
6
/// Element names that are always treated as boilerplate by `is_noise`.
const NOISE_TAGS: &[&str] = &[
    "nav", "header", "footer", "aside", "script", "style", "noscript", "iframe", "form",
];

/// Patterns looked for in a lower-cased `class`/`id` value to flag an element
/// as boilerplate.
const NOISE_CLASS_PATTERNS: &[&str] = &[
    "nav", "sidebar", "menu", "ad", "banner", "comment", "footer", "header", "widget",
];

/// Patterns looked for in a lower-cased `class`/`id` value that raise the
/// score of a content candidate.
const CONTENT_CLASS_PATTERNS: &[&str] = &[
    "article", "content", "main", "post", "entry", "body", "text",
];
10
/// Heuristic main-content extractor that works on a parsed rcdom tree.
pub struct ReadabilityExtractor {
    /// When `true`, `<a>` elements are written to the output HTML as anchors
    /// and recorded as extracted links; when `false`, only their children are
    /// serialized and no links are collected.
    pub preserve_links: bool,
}
14
15impl ReadabilityExtractor {
16    pub fn extract(&self, root: &Handle, base_url: &Url) -> ExtractedContent {
17        let title = extract_title(root);
18        let metadata = extract_metadata(root, base_url);
19        let body = find_body(root);
20
21        let main_node = body.as_ref()
22            .and_then(|b| find_main_content(b))
23            .or(body.clone());
24
25        let (mh, mt, ml) = main_node
26            .as_ref()
27            .map(|n| self.serialize_content(n, base_url))
28            .unwrap_or_else(|| (String::new(), String::new(), Vec::new()));
29
30        // 選択されたノードのコンテンツが極端に少ない場合は body 全体を試みる。
31        // ニュース一覧など「全てがリンク」な構造ではスコアリングが個別カードを選びがちなため。
32        let (body_html, body_text, links) = if mt.trim().len() < 200 {
33            if let Some(b) = body.as_ref() {
34                let (bh, bt, bl) = self.serialize_content(b, base_url);
35                if bt.trim().len() > mt.trim().len() {
36                    (bh, bt, bl)
37                } else {
38                    (mh, mt, ml)
39                }
40            } else {
41                (mh, mt, ml)
42            }
43        } else {
44            (mh, mt, ml)
45        };
46
47        ExtractedContent {
48            url: base_url.clone(),
49            title: title.unwrap_or_else(|| base_url.to_string()),
50            byline: metadata.og_title.clone(),
51            body_text,
52            body_html,
53            links,
54            metadata,
55        }
56    }
57
58    fn serialize_content(&self, handle: &Handle, base_url: &Url) -> (String, String, Vec<ExtractedLink>) {
59        let mut html = String::new();
60        let mut text = String::new();
61        let mut links = Vec::new();
62        serialize_node(handle, &mut html, &mut text, &mut links, base_url, self.preserve_links);
63        (html, text, links)
64    }
65}
66
67fn find_body(root: &Handle) -> Option<Handle> {
68    find_tag(root, "body")
69}
70
71fn find_tag(handle: &Handle, tag_name: &str) -> Option<Handle> {
72    if let NodeData::Element { name, .. } = &handle.data {
73        if name.local.as_ref() == tag_name {
74            return Some(handle.clone());
75        }
76    }
77    for child in handle.children.borrow().iter() {
78        if let Some(found) = find_tag(child, tag_name) {
79            return Some(found);
80        }
81    }
82    None
83}
84
85fn find_main_content(body: &Handle) -> Option<Handle> {
86    // <main>, <article> を優先
87    if let Some(node) = find_tag(body, "main").or_else(|| find_tag(body, "article")) {
88        return Some(node);
89    }
90
91    // Readability.js 方式: リーフノードのスコアを祖先コンテナへ伝播し、
92    // コンテンツが豊富な大きなブロックが選ばれるようにする。
93    let mut candidates: HashMap<usize, (Handle, f64)> = HashMap::new();
94    let mut ancestors: Vec<Handle> = Vec::new();
95    collect_candidate_scores(body, &mut ancestors, &mut candidates);
96
97    let best = candidates
98        .into_values()
99        .map(|(h, raw)| {
100            let text_len = count_text(&h) as f64;
101            let link_len = count_link_text(&h) as f64;
102            let density = if text_len > 0.0 { link_len / text_len } else { 1.0 };
103            let bonus = class_score(&h);
104            let score = (raw + bonus) * (1.0 - density);
105            (h, score)
106        })
107        .filter(|(_, s)| *s > 0.0)
108        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
109        .map(|(node, _)| node);
110
111    // シブリング展開: ベスト候補が「繰り返しパターンの1ユニット」であれば
112    // その親コンテナへ展開する(例: ul>li>a>div → ul を返す)
113    if let Some(ref node) = best {
114        if let Some(expanded) = try_sibling_expand(node) {
115            return Some(expanded);
116        }
117    }
118    best
119}
120
121/// ベスト候補が繰り返しパターンの1ユニットであれば親コンテナへ展開する。
122///
123/// 例: `ul > li > a > div`(div がベスト候補)の場合、
124/// li が 3 つ以上あることを検出して ul を返す。
125/// 最大 3 段階まで祖先を辿る。
126fn try_sibling_expand(node: &Handle) -> Option<Handle> {
127    let mut current = node.clone();
128    for _ in 0..3 {
129        // rcdom の parent は Cell<Option<Weak<Node>>> なので take→set で安全に参照する
130        let parent = {
131            let weak = current.parent.take();
132            current.parent.set(weak.clone());
133            weak?.upgrade()?
134        };
135
136        let current_tag = match &current.data {
137            NodeData::Element { name, .. } => name.local.as_ref().to_owned(),
138            _ => return None,
139        };
140
141        let same_tag_count = parent.children.borrow().iter()
142            .filter(|c| matches!(&c.data,
143                NodeData::Element { name, .. } if name.local.as_ref() == current_tag))
144            .count();
145
146        if same_tag_count >= 3 {
147            return Some(parent);
148        }
149
150        current = parent;
151    }
152    None
153}
154
155/// p/pre/blockquote/td/li などコンテンツ信号になるノードを起点に、
156/// 祖先コンテナへ最大 4 段階・重みを半減しながらスコアを伝播する。
157fn collect_candidate_scores(
158    handle: &Handle,
159    ancestors: &mut Vec<Handle>,
160    candidates: &mut HashMap<usize, (Handle, f64)>,
161) {
162    if is_noise(handle) {
163        return;
164    }
165
166    if let NodeData::Element { name, .. } = &handle.data {
167        let tag = name.local.as_ref();
168        let score = leaf_content_score(handle, tag);
169
170        if score > 0.0 {
171            let mut weight = 1.0;
172            let mut levels = 0usize;
173            for ancestor in ancestors.iter().rev() {
174                if let NodeData::Element { name: aname, .. } = &ancestor.data {
175                    if is_candidate_tag(aname.local.as_ref()) {
176                        let key = Rc::as_ptr(ancestor) as usize;
177                        candidates
178                            .entry(key)
179                            .or_insert_with(|| (ancestor.clone(), 0.0))
180                            .1 += score * weight;
181                        weight *= 0.5;
182                        levels += 1;
183                        if levels >= 4 {
184                            break;
185                        }
186                    }
187                }
188            }
189        }
190    }
191
192    ancestors.push(handle.clone());
193    for child in handle.children.borrow().iter() {
194        collect_candidate_scores(child, ancestors, candidates);
195    }
196    ancestors.pop();
197}
198
199/// コンテンツ信号となるリーフノードの基礎スコア
200fn leaf_content_score(handle: &Handle, tag: &str) -> f64 {
201    let text_len = count_text(handle) as f64;
202    if text_len < 20.0 {
203        return 0.0;
204    }
205    match tag {
206        "p" => 1.0 + (text_len / 100.0).min(3.0),
207        "pre" | "blockquote" => 3.0 + (text_len / 100.0).min(3.0),
208        "td" => (text_len / 50.0).min(3.0),
209        "li" => 0.5 + (text_len / 200.0).min(1.0),
210        _ => 0.0,
211    }
212}
213
/// Returns `true` for tags that may receive propagated scores as a
/// container candidate.
fn is_candidate_tag(tag: &str) -> bool {
    const CONTAINER_TAGS: [&str; 9] = [
        "div", "section", "article", "main", "blockquote", "pre", "td", "tbody", "p",
    ];
    CONTAINER_TAGS.contains(&tag)
}
218
219fn class_score(handle: &Handle) -> f64 {
220    let attrs = match &handle.data {
221        NodeData::Element { attrs, .. } => attrs.borrow(),
222        _ => return 0.0,
223    };
224
225    let mut score = 0.0;
226    for attr in attrs.iter() {
227        let name = attr.name.local.as_ref();
228        if name != "class" && name != "id" {
229            continue;
230        }
231        let val = attr.value.as_ref().to_lowercase();
232        for pattern in CONTENT_CLASS_PATTERNS {
233            if val.contains(pattern) {
234                score += 10.0;
235            }
236        }
237        for pattern in NOISE_CLASS_PATTERNS {
238            if val.contains(pattern) {
239                score -= 10.0;
240            }
241        }
242    }
243    score
244}
245
246fn is_noise(handle: &Handle) -> bool {
247    match &handle.data {
248        NodeData::Element { name, attrs, .. } => {
249            let tag = name.local.as_ref();
250            if NOISE_TAGS.contains(&tag) {
251                return true;
252            }
253            let attrs = attrs.borrow();
254            for attr in attrs.iter() {
255                let aname = attr.name.local.as_ref();
256                if aname != "class" && aname != "id" {
257                    continue;
258                }
259                let val = attr.value.as_ref().to_lowercase();
260                for pattern in NOISE_CLASS_PATTERNS {
261                    if val.contains(pattern) {
262                        return true;
263                    }
264                }
265            }
266            false
267        }
268        _ => false,
269    }
270}
271
272fn count_text(handle: &Handle) -> usize {
273    let mut total = 0;
274    count_text_inner(handle, &mut total);
275    total
276}
277
278fn count_text_inner(handle: &Handle, total: &mut usize) {
279    match &handle.data {
280        NodeData::Text { contents } => {
281            *total += contents.borrow().trim().len();
282        }
283        NodeData::Element { name, .. } => {
284            let tag = name.local.as_ref();
285            if tag == "script" || tag == "style" {
286                return;
287            }
288            for child in handle.children.borrow().iter() {
289                count_text_inner(child, total);
290            }
291        }
292        _ => {
293            for child in handle.children.borrow().iter() {
294                count_text_inner(child, total);
295            }
296        }
297    }
298}
299
300fn count_link_text(handle: &Handle) -> usize {
301    let mut total = 0;
302    count_link_text_inner(handle, &mut total, false);
303    total
304}
305
306fn count_link_text_inner(handle: &Handle, total: &mut usize, in_link: bool) {
307    match &handle.data {
308        NodeData::Text { contents } if in_link => {
309            *total += contents.borrow().trim().len();
310        }
311        NodeData::Element { name, .. } => {
312            let tag = name.local.as_ref();
313            let is_link = tag == "a";
314            for child in handle.children.borrow().iter() {
315                count_link_text_inner(child, total, in_link || is_link);
316            }
317        }
318        _ => {}
319    }
320}
321
322fn serialize_node(
323    handle: &Handle,
324    html: &mut String,
325    text: &mut String,
326    links: &mut Vec<ExtractedLink>,
327    base_url: &Url,
328    preserve_links: bool,
329) {
330    if is_noise(handle) {
331        return;
332    }
333
334    match &handle.data {
335        NodeData::Text { contents } => {
336            let t = contents.borrow();
337            let trimmed = t.as_ref();
338            if !trimmed.trim().is_empty() {
339                html.push_str(&html_escape(trimmed));
340                text.push_str(trimmed);
341            }
342        }
343        NodeData::Element { name, attrs, .. } => {
344            let tag = name.local.as_ref();
345            let attrs_ref = attrs.borrow();
346
347            match tag {
348                "script" | "style" | "noscript" | "iframe" => return,
349                "a" if preserve_links => {
350                    let href = attrs_ref.iter()
351                        .find(|a| a.name.local.as_ref() == "href")
352                        .map(|a| a.value.as_ref().to_owned());
353                    let rel = attrs_ref.iter()
354                        .find(|a| a.name.local.as_ref() == "rel")
355                        .map(|a| a.value.as_ref().to_owned());
356
357                    let resolved = href.as_deref().and_then(|h| base_url.join(h).ok());
358
359                    html.push_str("<a");
360                    if let Some(ref h) = href {
361                        html.push_str(&format!(" href=\"{}\"", html_escape(h)));
362                    }
363                    html.push('>');
364
365                    let mut link_text = String::new();
366                    let mut link_html = String::new();
367                    for child in handle.children.borrow().iter() {
368                        serialize_node(child, &mut link_html, text, links, base_url, preserve_links);
369                        collect_text(child, &mut link_text);
370                    }
371                    html.push_str(&link_html);
372                    html.push_str("</a>");
373
374                    if let Some(href_url) = resolved {
375                        let trimmed = link_text.trim().to_owned();
376                        // 画像リンク等でアンカーテキストが空のものは除外
377                        if !trimmed.is_empty() {
378                            links.push(ExtractedLink {
379                                text: trimmed,
380                                href: href_url,
381                                rel,
382                            });
383                        }
384                    }
385                    return;
386                }
387                _ => {
388                    // ブロック要素
389                    let is_block = matches!(tag, "p" | "div" | "section" | "article" |
390                        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" |
391                        "ul" | "ol" | "li" | "blockquote" | "pre" | "br" | "hr" |
392                        "table" | "tr" | "td" | "th" | "thead" | "tbody");
393
394                    if is_block {
395                        html.push('<');
396                        html.push_str(tag);
397                        html.push('>');
398                        if tag == "br" || tag == "hr" {
399                            // self-closing
400                        } else {
401                            for child in handle.children.borrow().iter() {
402                                serialize_node(child, html, text, links, base_url, preserve_links);
403                            }
404                            html.push_str("</");
405                            html.push_str(tag);
406                            html.push('>');
407                        }
408                    } else {
409                        // インライン要素はそのまま子を出力
410                        for child in handle.children.borrow().iter() {
411                            serialize_node(child, html, text, links, base_url, preserve_links);
412                        }
413                    }
414                    return;
415                }
416            }
417        }
418        _ => {}
419    }
420}
421
422fn collect_text(handle: &Handle, out: &mut String) {
423    match &handle.data {
424        NodeData::Text { contents } => {
425            out.push_str(contents.borrow().as_ref());
426        }
427        _ => {
428            for child in handle.children.borrow().iter() {
429                collect_text(child, out);
430            }
431        }
432    }
433}
434
/// Escapes `&`, `<`, `>` and `"` so `s` can be placed in HTML text or in a
/// double-quoted attribute value.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
441
442fn extract_title(root: &Handle) -> Option<String> {
443    // <title> タグを優先、次に <h1> を試みる
444    if let Some(title_node) = find_tag(root, "title") {
445        let mut text = String::new();
446        collect_text(&title_node, &mut text);
447        let trimmed = text.trim().to_owned();
448        if !trimmed.is_empty() {
449            return Some(trimmed);
450        }
451    }
452    if let Some(h1) = find_tag(root, "h1") {
453        let mut text = String::new();
454        collect_text(&h1, &mut text);
455        let trimmed = text.trim().to_owned();
456        if !trimmed.is_empty() {
457            return Some(trimmed);
458        }
459    }
460    None
461}
462
463fn extract_metadata(root: &Handle, base_url: &Url) -> PageMetadata {
464    let mut meta = PageMetadata {
465        description: None,
466        og_title: None,
467        og_image: None,
468        canonical: None,
469        published_at: None,
470    };
471    collect_meta(root, &mut meta, base_url);
472    meta
473}
474
475fn collect_meta(handle: &Handle, meta: &mut PageMetadata, base_url: &Url) {
476    if let NodeData::Element { name, attrs, .. } = &handle.data {
477        let tag = name.local.as_ref();
478        let attrs_ref = attrs.borrow();
479
480        if tag == "meta" {
481            let name_attr = attrs_ref.iter()
482                .find(|a| a.name.local.as_ref() == "name")
483                .map(|a| a.value.as_ref().to_lowercase());
484            let property_attr = attrs_ref.iter()
485                .find(|a| a.name.local.as_ref() == "property")
486                .map(|a| a.value.as_ref().to_lowercase());
487            let content = attrs_ref.iter()
488                .find(|a| a.name.local.as_ref() == "content")
489                .map(|a| a.value.as_ref().to_owned());
490
491            match (name_attr.as_deref(), property_attr.as_deref(), content) {
492                (Some("description"), _, Some(c)) => meta.description = Some(c),
493                (_, Some("og:title"), Some(c)) => meta.og_title = Some(c),
494                (_, Some("og:image"), Some(c)) => meta.og_image = Some(c),
495                _ => {}
496            }
497        } else if tag == "link" {
498            let is_canonical = attrs_ref.iter()
499                .any(|a| a.name.local.as_ref() == "rel" && a.value.as_ref() == "canonical");
500            if is_canonical {
501                if let Some(href) = attrs_ref.iter()
502                    .find(|a| a.name.local.as_ref() == "href")
503                    .and_then(|a| base_url.join(a.value.as_ref()).ok())
504                {
505                    meta.canonical = Some(href);
506                }
507            }
508        }
509    }
510
511    for child in handle.children.borrow().iter() {
512        collect_meta(child, meta, base_url);
513    }
514}