Skip to main content

stillo_core/extractor/
readability.rs

1use markup5ever_rcdom::{Handle, NodeData};
2use url::Url;
3use std::collections::HashMap;
4use std::rc::Rc;
5use crate::document::{ExtractedContent, ExtractedLink, PageMetadata};
6
/// Element names that `is_noise` treats as boilerplate; such elements and
/// their whole subtrees are skipped during scoring and serialization.
const NOISE_TAGS: &[&str] = &[
    "nav",
    "header",
    "footer",
    "aside",
    "script",
    "style",
    "noscript",
    "iframe",
    "form",
];

/// `class` / `id` components that mark an element as boilerplate.
/// Matched per hyphen-separated component by `class_contains_pattern`.
const NOISE_CLASS_PATTERNS: &[&str] = &[
    "nav",
    "sidebar",
    "menu",
    "ad",
    "banner",
    "comment",
    "footer",
    "header",
    "widget",
];

/// `class` / `id` components that earn a candidate container a bonus in
/// `class_score`.
const CONTENT_CLASS_PATTERNS: &[&str] = &[
    "article",
    "content",
    "main",
    "post",
    "entry",
    "body",
    "text",
];
10
11pub struct ReadabilityExtractor {
12    pub preserve_links: bool,
13}
14
15impl ReadabilityExtractor {
16    pub fn extract(&self, root: &Handle, base_url: &Url) -> ExtractedContent {
17        let title = extract_title(root);
18        let metadata = extract_metadata(root, base_url);
19        let body = find_body(root);
20
21        let main_node = body.as_ref()
22            .and_then(|b| find_main_content(b))
23            .or(body.clone());
24
25        let (mh, mt, ml) = main_node
26            .as_ref()
27            .map(|n| self.serialize_content(n, base_url))
28            .unwrap_or_else(|| (String::new(), String::new(), Vec::new()));
29
30        // 選択されたノードのコンテンツが極端に少ない場合は body 全体を試みる。
31        // ニュース一覧など「全てがリンク」な構造ではスコアリングが個別カードを選びがちなため。
32        let (body_html, body_text, links) = if mt.trim().len() < 200 {
33            if let Some(b) = body.as_ref() {
34                let (bh, bt, bl) = self.serialize_content(b, base_url);
35                if bt.trim().len() > mt.trim().len() {
36                    (bh, bt, bl)
37                } else {
38                    (mh, mt, ml)
39                }
40            } else {
41                (mh, mt, ml)
42            }
43        } else {
44            (mh, mt, ml)
45        };
46
47        ExtractedContent {
48            url: base_url.clone(),
49            title: title.unwrap_or_else(|| base_url.to_string()),
50            byline: metadata.og_title.clone(),
51            body_text,
52            body_html,
53            links,
54            metadata,
55        }
56    }
57
58    fn serialize_content(&self, handle: &Handle, base_url: &Url) -> (String, String, Vec<ExtractedLink>) {
59        let mut html = String::new();
60        let mut text = String::new();
61        let mut links = Vec::new();
62        serialize_node(handle, &mut html, &mut text, &mut links, base_url, self.preserve_links);
63        (html, text, links)
64    }
65}
66
67fn find_body(root: &Handle) -> Option<Handle> {
68    find_tag(root, "body")
69}
70
71fn find_tag(handle: &Handle, tag_name: &str) -> Option<Handle> {
72    if let NodeData::Element { name, .. } = &handle.data {
73        if name.local.as_ref() == tag_name {
74            return Some(handle.clone());
75        }
76    }
77    for child in handle.children.borrow().iter() {
78        if let Some(found) = find_tag(child, tag_name) {
79            return Some(found);
80        }
81    }
82    None
83}
84
85fn find_main_content(body: &Handle) -> Option<Handle> {
86    // <main>, <article> を優先
87    if let Some(node) = find_tag(body, "main").or_else(|| find_tag(body, "article")) {
88        return Some(node);
89    }
90
91    // Readability.js 方式: リーフノードのスコアを祖先コンテナへ伝播し、
92    // コンテンツが豊富な大きなブロックが選ばれるようにする。
93    let mut candidates: HashMap<usize, (Handle, f64)> = HashMap::new();
94    let mut ancestors: Vec<Handle> = Vec::new();
95    collect_candidate_scores(body, &mut ancestors, &mut candidates);
96
97    let best = candidates
98        .into_values()
99        .map(|(h, raw)| {
100            let text_len = count_text(&h) as f64;
101            let link_len = count_link_text(&h) as f64;
102            let density = if text_len > 0.0 { link_len / text_len } else { 1.0 };
103            let bonus = class_score(&h);
104            let score = (raw + bonus) * (1.0 - density);
105            (h, score)
106        })
107        .filter(|(_, s)| *s > 0.0)
108        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
109        .map(|(node, _)| node);
110
111    // シブリング展開: ベスト候補が「繰り返しパターンの1ユニット」であれば
112    // その親コンテナへ展開する(例: ul>li>a>div → ul を返す)
113    if let Some(ref node) = best {
114        if let Some(expanded) = try_sibling_expand(node) {
115            return Some(expanded);
116        }
117    }
118    best
119}
120
121/// ベスト候補が繰り返しパターンの1ユニットであれば親コンテナへ展開する。
122///
123/// 例: `ul > li > a > div`(div がベスト候補)の場合、
124/// li が 3 つ以上あることを検出して ul を返す。
125/// 最大 3 段階まで祖先を辿る。
126fn try_sibling_expand(node: &Handle) -> Option<Handle> {
127    let mut current = node.clone();
128    for _ in 0..3 {
129        // rcdom の parent は Cell<Option<Weak<Node>>> なので take→set で安全に参照する
130        let parent = {
131            let weak = current.parent.take();
132            current.parent.set(weak.clone());
133            weak?.upgrade()?
134        };
135
136        let current_tag = match &current.data {
137            NodeData::Element { name, .. } => name.local.as_ref().to_owned(),
138            _ => return None,
139        };
140
141        let same_tag_count = parent.children.borrow().iter()
142            .filter(|c| matches!(&c.data,
143                NodeData::Element { name, .. } if name.local.as_ref() == current_tag))
144            .count();
145
146        if same_tag_count >= 3 {
147            return Some(parent);
148        }
149
150        current = parent;
151    }
152    None
153}
154
155/// p/pre/blockquote/td/li などコンテンツ信号になるノードを起点に、
156/// 祖先コンテナへ最大 4 段階・重みを半減しながらスコアを伝播する。
157fn collect_candidate_scores(
158    handle: &Handle,
159    ancestors: &mut Vec<Handle>,
160    candidates: &mut HashMap<usize, (Handle, f64)>,
161) {
162    if is_noise(handle) {
163        return;
164    }
165
166    if let NodeData::Element { name, .. } = &handle.data {
167        let tag = name.local.as_ref();
168        let score = leaf_content_score(handle, tag);
169
170        if score > 0.0 {
171            let mut weight = 1.0;
172            let mut levels = 0usize;
173            for ancestor in ancestors.iter().rev() {
174                if let NodeData::Element { name: aname, .. } = &ancestor.data {
175                    if is_candidate_tag(aname.local.as_ref()) {
176                        let key = Rc::as_ptr(ancestor) as usize;
177                        candidates
178                            .entry(key)
179                            .or_insert_with(|| (ancestor.clone(), 0.0))
180                            .1 += score * weight;
181                        weight *= 0.5;
182                        levels += 1;
183                        if levels >= 4 {
184                            break;
185                        }
186                    }
187                }
188            }
189        }
190    }
191
192    ancestors.push(handle.clone());
193    for child in handle.children.borrow().iter() {
194        collect_candidate_scores(child, ancestors, candidates);
195    }
196    ancestors.pop();
197}
198
199/// コンテンツ信号となるリーフノードの基礎スコア
200fn leaf_content_score(handle: &Handle, tag: &str) -> f64 {
201    let text_len = count_text(handle) as f64;
202    if text_len < 20.0 {
203        return 0.0;
204    }
205    match tag {
206        "p" => 1.0 + (text_len / 100.0).min(3.0),
207        "pre" | "blockquote" => 3.0 + (text_len / 100.0).min(3.0),
208        "td" => (text_len / 50.0).min(3.0),
209        "li" => 0.5 + (text_len / 200.0).min(1.0),
210        _ => 0.0,
211    }
212}
213
/// Returns `true` for tags that may act as a score-receiving container.
/// The comparison is exact (case-sensitive).
fn is_candidate_tag(tag: &str) -> bool {
    const CONTAINER_TAGS: [&str; 9] = [
        "div",
        "section",
        "article",
        "main",
        "blockquote",
        "pre",
        "td",
        "tbody",
        "p",
    ];
    CONTAINER_TAGS.contains(&tag)
}
218
219fn class_score(handle: &Handle) -> f64 {
220    let attrs = match &handle.data {
221        NodeData::Element { attrs, .. } => attrs.borrow(),
222        _ => return 0.0,
223    };
224
225    let mut score = 0.0;
226    for attr in attrs.iter() {
227        let name = attr.name.local.as_ref();
228        if name != "class" && name != "id" {
229            continue;
230        }
231        let val = attr.value.as_ref().to_lowercase();
232        for pattern in CONTENT_CLASS_PATTERNS {
233            if class_contains_pattern(&val, pattern) {
234                score += 10.0;
235            }
236        }
237        for pattern in NOISE_CLASS_PATTERNS {
238            if class_contains_pattern(&val, pattern) {
239                score -= 10.0;
240            }
241        }
242    }
243    score
244}
245
246fn is_noise(handle: &Handle) -> bool {
247    match &handle.data {
248        NodeData::Element { name, attrs, .. } => {
249            let tag = name.local.as_ref();
250            if NOISE_TAGS.contains(&tag) {
251                return true;
252            }
253            let attrs = attrs.borrow();
254            for attr in attrs.iter() {
255                let aname = attr.name.local.as_ref();
256                if aname != "class" && aname != "id" {
257                    continue;
258                }
259                let val = attr.value.as_ref().to_lowercase();
260                for pattern in NOISE_CLASS_PATTERNS {
261                    if class_contains_pattern(&val, pattern) {
262                        return true;
263                    }
264                }
265            }
266            false
267        }
268        _ => false,
269    }
270}
271
/// Checks whether a class string contains `pattern` as a whole
/// hyphen-separated component of one of its class names.
///
/// The value is split on whitespace into class names, any `prefix:` part
/// (e.g. Tailwind's `sm:`, `md:`, `lg:`) is dropped, and the rest is split
/// on `-`. Matching whole components avoids false positives such as
/// `"shadow-2xs"` or `"advert"` matching `"ad"`.
fn class_contains_pattern(class_val: &str, pattern: &str) -> bool {
    for token in class_val.split_whitespace() {
        // Keep only the part after the last ':' (responsive/variant prefix).
        let bare = token.rsplit(':').next().unwrap_or(token);
        if bare.split('-').any(|component| component == pattern) {
            return true;
        }
    }
    false
}
283
284fn count_text(handle: &Handle) -> usize {
285    let mut total = 0;
286    count_text_inner(handle, &mut total);
287    total
288}
289
290fn count_text_inner(handle: &Handle, total: &mut usize) {
291    match &handle.data {
292        NodeData::Text { contents } => {
293            *total += contents.borrow().trim().len();
294        }
295        NodeData::Element { name, .. } => {
296            let tag = name.local.as_ref();
297            if tag == "script" || tag == "style" {
298                return;
299            }
300            for child in handle.children.borrow().iter() {
301                count_text_inner(child, total);
302            }
303        }
304        _ => {
305            for child in handle.children.borrow().iter() {
306                count_text_inner(child, total);
307            }
308        }
309    }
310}
311
312fn count_link_text(handle: &Handle) -> usize {
313    let mut total = 0;
314    count_link_text_inner(handle, &mut total, false);
315    total
316}
317
318fn count_link_text_inner(handle: &Handle, total: &mut usize, in_link: bool) {
319    match &handle.data {
320        NodeData::Text { contents } if in_link => {
321            *total += contents.borrow().trim().len();
322        }
323        NodeData::Element { name, .. } => {
324            let tag = name.local.as_ref();
325            let is_link = tag == "a";
326            for child in handle.children.borrow().iter() {
327                count_link_text_inner(child, total, in_link || is_link);
328            }
329        }
330        _ => {}
331    }
332}
333
334fn serialize_node(
335    handle: &Handle,
336    html: &mut String,
337    text: &mut String,
338    links: &mut Vec<ExtractedLink>,
339    base_url: &Url,
340    preserve_links: bool,
341) {
342    if is_noise(handle) {
343        return;
344    }
345
346    match &handle.data {
347        NodeData::Text { contents } => {
348            let t = contents.borrow();
349            let trimmed = t.as_ref();
350            if !trimmed.trim().is_empty() {
351                html.push_str(&html_escape(trimmed));
352                text.push_str(trimmed);
353            }
354        }
355        NodeData::Element { name, attrs, .. } => {
356            let tag = name.local.as_ref();
357            let attrs_ref = attrs.borrow();
358
359            match tag {
360                "script" | "style" | "noscript" | "iframe" => return,
361                "a" if preserve_links => {
362                    let href = attrs_ref.iter()
363                        .find(|a| a.name.local.as_ref() == "href")
364                        .map(|a| a.value.as_ref().to_owned());
365                    let rel = attrs_ref.iter()
366                        .find(|a| a.name.local.as_ref() == "rel")
367                        .map(|a| a.value.as_ref().to_owned());
368
369                    let resolved = href.as_deref().and_then(|h| base_url.join(h).ok());
370
371                    html.push_str("<a");
372                    if let Some(ref h) = href {
373                        html.push_str(&format!(" href=\"{}\"", html_escape(h)));
374                    }
375                    html.push('>');
376
377                    let mut link_text = String::new();
378                    let mut link_html = String::new();
379                    for child in handle.children.borrow().iter() {
380                        serialize_node(child, &mut link_html, text, links, base_url, preserve_links);
381                        collect_text(child, &mut link_text);
382                    }
383                    html.push_str(&link_html);
384                    html.push_str("</a>");
385
386                    if let Some(href_url) = resolved {
387                        let trimmed = link_text.trim().to_owned();
388                        // 画像リンク等でアンカーテキストが空のものは除外
389                        if !trimmed.is_empty() {
390                            links.push(ExtractedLink {
391                                text: trimmed,
392                                href: href_url,
393                                rel,
394                            });
395                        }
396                    }
397                    return;
398                }
399                _ => {
400                    // ブロック要素
401                    let is_block = matches!(tag, "p" | "div" | "section" | "article" |
402                        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" |
403                        "ul" | "ol" | "li" | "blockquote" | "pre" | "br" | "hr" |
404                        "table" | "tr" | "td" | "th" | "thead" | "tbody");
405
406                    if is_block {
407                        html.push('<');
408                        html.push_str(tag);
409                        html.push('>');
410                        if tag == "br" || tag == "hr" {
411                            // self-closing
412                        } else {
413                            for child in handle.children.borrow().iter() {
414                                serialize_node(child, html, text, links, base_url, preserve_links);
415                            }
416                            html.push_str("</");
417                            html.push_str(tag);
418                            html.push('>');
419                        }
420                    } else {
421                        // インライン要素はそのまま子を出力
422                        for child in handle.children.borrow().iter() {
423                            serialize_node(child, html, text, links, base_url, preserve_links);
424                        }
425                    }
426                    return;
427                }
428            }
429        }
430        _ => {}
431    }
432}
433
434fn collect_text(handle: &Handle, out: &mut String) {
435    match &handle.data {
436        NodeData::Text { contents } => {
437            out.push_str(contents.borrow().as_ref());
438        }
439        _ => {
440            for child in handle.children.borrow().iter() {
441                collect_text(child, out);
442            }
443        }
444    }
445}
446
/// Escapes `&`, `<`, `>` and `"` so that `s` can be embedded in HTML text or
/// in a double-quoted attribute value. All other characters pass through.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
453
454fn extract_title(root: &Handle) -> Option<String> {
455    // <title> タグを優先、次に <h1> を試みる
456    if let Some(title_node) = find_tag(root, "title") {
457        let mut text = String::new();
458        collect_text(&title_node, &mut text);
459        let trimmed = text.trim().to_owned();
460        if !trimmed.is_empty() {
461            return Some(trimmed);
462        }
463    }
464    if let Some(h1) = find_tag(root, "h1") {
465        let mut text = String::new();
466        collect_text(&h1, &mut text);
467        let trimmed = text.trim().to_owned();
468        if !trimmed.is_empty() {
469            return Some(trimmed);
470        }
471    }
472    None
473}
474
475fn extract_metadata(root: &Handle, base_url: &Url) -> PageMetadata {
476    let mut meta = PageMetadata {
477        description: None,
478        og_title: None,
479        og_image: None,
480        canonical: None,
481        published_at: None,
482    };
483    collect_meta(root, &mut meta, base_url);
484    meta
485}
486
487fn collect_meta(handle: &Handle, meta: &mut PageMetadata, base_url: &Url) {
488    if let NodeData::Element { name, attrs, .. } = &handle.data {
489        let tag = name.local.as_ref();
490        let attrs_ref = attrs.borrow();
491
492        if tag == "meta" {
493            let name_attr = attrs_ref.iter()
494                .find(|a| a.name.local.as_ref() == "name")
495                .map(|a| a.value.as_ref().to_lowercase());
496            let property_attr = attrs_ref.iter()
497                .find(|a| a.name.local.as_ref() == "property")
498                .map(|a| a.value.as_ref().to_lowercase());
499            let content = attrs_ref.iter()
500                .find(|a| a.name.local.as_ref() == "content")
501                .map(|a| a.value.as_ref().to_owned());
502
503            match (name_attr.as_deref(), property_attr.as_deref(), content) {
504                (Some("description"), _, Some(c)) => meta.description = Some(c),
505                (_, Some("og:title"), Some(c)) => meta.og_title = Some(c),
506                (_, Some("og:image"), Some(c)) => meta.og_image = Some(c),
507                _ => {}
508            }
509        } else if tag == "link" {
510            let is_canonical = attrs_ref.iter()
511                .any(|a| a.name.local.as_ref() == "rel" && a.value.as_ref() == "canonical");
512            if is_canonical {
513                if let Some(href) = attrs_ref.iter()
514                    .find(|a| a.name.local.as_ref() == "href")
515                    .and_then(|a| base_url.join(a.value.as_ref()).ok())
516                {
517                    meta.canonical = Some(href);
518                }
519            }
520        }
521    }
522
523    for child in handle.children.borrow().iter() {
524        collect_meta(child, meta, base_url);
525    }
526}