Skip to main content

stillo_core/extractor/
readability.rs

1use markup5ever_rcdom::{Handle, NodeData};
2use url::Url;
3use crate::document::{ExtractedContent, ExtractedLink, PageMetadata};
4
/// Element names whose whole subtree is treated as non-content.
const NOISE_TAGS: &[&str] = &[
    "nav", "header", "footer", "aside", "script", "style", "noscript", "iframe", "form",
];

/// Fragments of `class` / `id` values that mark an element as page chrome rather than content.
const NOISE_CLASS_PATTERNS: &[&str] = &[
    "nav", "sidebar", "menu", "ad", "banner", "comment", "footer", "header", "widget",
];

/// Fragments of `class` / `id` values that hint at the main content container.
const CONTENT_CLASS_PATTERNS: &[&str] = &[
    "article", "content", "main", "post", "entry", "body", "text",
];
8
/// Heuristic main-content extractor for a parsed HTML document.
pub struct ReadabilityExtractor {
    /// When `true`, `<a>` elements are kept in the HTML output and collected as
    /// extracted links; when `false`, anchors are flattened like any other inline element.
    pub preserve_links: bool,
}
12
13impl ReadabilityExtractor {
14    pub fn extract(&self, root: &Handle, base_url: &Url) -> ExtractedContent {
15        let title = extract_title(root);
16        let metadata = extract_metadata(root, base_url);
17        let body = find_body(root);
18
19        let main_node = body.as_ref()
20            .and_then(|b| find_main_content(b))
21            .or(body.clone());
22
23        let (body_html, body_text, links) = main_node
24            .as_ref()
25            .map(|n| self.serialize_content(n, base_url))
26            .unwrap_or_else(|| (String::new(), String::new(), Vec::new()));
27
28        ExtractedContent {
29            url: base_url.clone(),
30            title: title.unwrap_or_else(|| base_url.to_string()),
31            byline: metadata.og_title.clone(),
32            body_text,
33            body_html,
34            links,
35            metadata,
36        }
37    }
38
39    fn serialize_content(&self, handle: &Handle, base_url: &Url) -> (String, String, Vec<ExtractedLink>) {
40        let mut html = String::new();
41        let mut text = String::new();
42        let mut links = Vec::new();
43        serialize_node(handle, &mut html, &mut text, &mut links, base_url, self.preserve_links);
44        (html, text, links)
45    }
46}
47
48fn find_body(root: &Handle) -> Option<Handle> {
49    find_tag(root, "body")
50}
51
52fn find_tag(handle: &Handle, tag_name: &str) -> Option<Handle> {
53    if let NodeData::Element { name, .. } = &handle.data {
54        if name.local.as_ref() == tag_name {
55            return Some(handle.clone());
56        }
57    }
58    for child in handle.children.borrow().iter() {
59        if let Some(found) = find_tag(child, tag_name) {
60            return Some(found);
61        }
62    }
63    None
64}
65
66fn find_main_content(body: &Handle) -> Option<Handle> {
67    // まず <main>, <article> を優先探索
68    if let Some(node) = find_tag(body, "main").or_else(|| find_tag(body, "article")) {
69        return Some(node);
70    }
71
72    // スコアリングでメインコンテンツを特定
73    let mut best: Option<(Handle, f64)> = None;
74    score_nodes(body, &mut best);
75    best.map(|(node, _)| node)
76}
77
78fn score_nodes(handle: &Handle, best: &mut Option<(Handle, f64)>) {
79    if is_noise(handle) {
80        return;
81    }
82
83    if let NodeData::Element { name, .. } = &handle.data {
84        let tag = name.local.as_ref();
85        let score = compute_score(handle, tag);
86        if score > 20.0 {
87            match best {
88                None => *best = Some((handle.clone(), score)),
89                Some((_, best_score)) if score > *best_score => {
90                    *best = Some((handle.clone(), score));
91                }
92                _ => {}
93            }
94        }
95    }
96
97    for child in handle.children.borrow().iter() {
98        score_nodes(child, best);
99    }
100}
101
102fn compute_score(handle: &Handle, tag: &str) -> f64 {
103    let base = match tag {
104        "article" => 30.0,
105        "section" => 10.0,
106        "div" => 5.0,
107        "p" => 3.0,
108        "td" => 3.0,
109        "blockquote" => 3.0,
110        "pre" => 3.0,
111        _ => 0.0,
112    };
113
114    if base == 0.0 {
115        return 0.0;
116    }
117
118    // クラス/IDによる補正
119    let class_bonus = class_score(handle);
120
121    let text_len = count_text(handle) as f64;
122    let link_len = count_link_text(handle) as f64;
123
124    // リンク密度ペナルティ: リンクテキストが多いほど本文らしくない
125    let link_density = if text_len > 0.0 { link_len / text_len } else { 0.0 };
126    let density_penalty = link_density * 50.0;
127
128    base + class_bonus + (text_len * 0.1).min(30.0) - density_penalty
129}
130
131fn class_score(handle: &Handle) -> f64 {
132    let attrs = match &handle.data {
133        NodeData::Element { attrs, .. } => attrs.borrow(),
134        _ => return 0.0,
135    };
136
137    let mut score = 0.0;
138    for attr in attrs.iter() {
139        let name = attr.name.local.as_ref();
140        if name != "class" && name != "id" {
141            continue;
142        }
143        let val = attr.value.as_ref().to_lowercase();
144        for pattern in CONTENT_CLASS_PATTERNS {
145            if val.contains(pattern) {
146                score += 10.0;
147            }
148        }
149        for pattern in NOISE_CLASS_PATTERNS {
150            if val.contains(pattern) {
151                score -= 10.0;
152            }
153        }
154    }
155    score
156}
157
158fn is_noise(handle: &Handle) -> bool {
159    match &handle.data {
160        NodeData::Element { name, attrs, .. } => {
161            let tag = name.local.as_ref();
162            if NOISE_TAGS.contains(&tag) {
163                return true;
164            }
165            let attrs = attrs.borrow();
166            for attr in attrs.iter() {
167                let aname = attr.name.local.as_ref();
168                if aname != "class" && aname != "id" {
169                    continue;
170                }
171                let val = attr.value.as_ref().to_lowercase();
172                for pattern in NOISE_CLASS_PATTERNS {
173                    if val.contains(pattern) {
174                        return true;
175                    }
176                }
177            }
178            false
179        }
180        _ => false,
181    }
182}
183
184fn count_text(handle: &Handle) -> usize {
185    let mut total = 0;
186    count_text_inner(handle, &mut total);
187    total
188}
189
190fn count_text_inner(handle: &Handle, total: &mut usize) {
191    match &handle.data {
192        NodeData::Text { contents } => {
193            *total += contents.borrow().trim().len();
194        }
195        NodeData::Element { name, .. } => {
196            let tag = name.local.as_ref();
197            if tag == "script" || tag == "style" {
198                return;
199            }
200            for child in handle.children.borrow().iter() {
201                count_text_inner(child, total);
202            }
203        }
204        _ => {
205            for child in handle.children.borrow().iter() {
206                count_text_inner(child, total);
207            }
208        }
209    }
210}
211
212fn count_link_text(handle: &Handle) -> usize {
213    let mut total = 0;
214    count_link_text_inner(handle, &mut total, false);
215    total
216}
217
218fn count_link_text_inner(handle: &Handle, total: &mut usize, in_link: bool) {
219    match &handle.data {
220        NodeData::Text { contents } if in_link => {
221            *total += contents.borrow().trim().len();
222        }
223        NodeData::Element { name, .. } => {
224            let tag = name.local.as_ref();
225            let is_link = tag == "a";
226            for child in handle.children.borrow().iter() {
227                count_link_text_inner(child, total, in_link || is_link);
228            }
229        }
230        _ => {}
231    }
232}
233
234fn serialize_node(
235    handle: &Handle,
236    html: &mut String,
237    text: &mut String,
238    links: &mut Vec<ExtractedLink>,
239    base_url: &Url,
240    preserve_links: bool,
241) {
242    if is_noise(handle) {
243        return;
244    }
245
246    match &handle.data {
247        NodeData::Text { contents } => {
248            let t = contents.borrow();
249            let trimmed = t.as_ref();
250            if !trimmed.trim().is_empty() {
251                html.push_str(&html_escape(trimmed));
252                text.push_str(trimmed);
253            }
254        }
255        NodeData::Element { name, attrs, .. } => {
256            let tag = name.local.as_ref();
257            let attrs_ref = attrs.borrow();
258
259            match tag {
260                "script" | "style" | "noscript" | "iframe" => return,
261                "a" if preserve_links => {
262                    let href = attrs_ref.iter()
263                        .find(|a| a.name.local.as_ref() == "href")
264                        .map(|a| a.value.as_ref().to_owned());
265                    let rel = attrs_ref.iter()
266                        .find(|a| a.name.local.as_ref() == "rel")
267                        .map(|a| a.value.as_ref().to_owned());
268
269                    let resolved = href.as_deref().and_then(|h| base_url.join(h).ok());
270
271                    html.push_str("<a");
272                    if let Some(ref h) = href {
273                        html.push_str(&format!(" href=\"{}\"", html_escape(h)));
274                    }
275                    html.push('>');
276
277                    let mut link_text = String::new();
278                    let mut link_html = String::new();
279                    for child in handle.children.borrow().iter() {
280                        serialize_node(child, &mut link_html, text, links, base_url, preserve_links);
281                        collect_text(child, &mut link_text);
282                    }
283                    html.push_str(&link_html);
284                    html.push_str("</a>");
285
286                    if let Some(href_url) = resolved {
287                        links.push(ExtractedLink {
288                            text: link_text.trim().to_owned(),
289                            href: href_url,
290                            rel,
291                        });
292                    }
293                    return;
294                }
295                _ => {
296                    // ブロック要素
297                    let is_block = matches!(tag, "p" | "div" | "section" | "article" |
298                        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" |
299                        "ul" | "ol" | "li" | "blockquote" | "pre" | "br" | "hr" |
300                        "table" | "tr" | "td" | "th" | "thead" | "tbody");
301
302                    if is_block {
303                        html.push('<');
304                        html.push_str(tag);
305                        html.push('>');
306                        if tag == "br" || tag == "hr" {
307                            // self-closing
308                        } else {
309                            for child in handle.children.borrow().iter() {
310                                serialize_node(child, html, text, links, base_url, preserve_links);
311                            }
312                            html.push_str("</");
313                            html.push_str(tag);
314                            html.push('>');
315                        }
316                    } else {
317                        // インライン要素はそのまま子を出力
318                        for child in handle.children.borrow().iter() {
319                            serialize_node(child, html, text, links, base_url, preserve_links);
320                        }
321                    }
322                    return;
323                }
324            }
325        }
326        _ => {}
327    }
328}
329
330fn collect_text(handle: &Handle, out: &mut String) {
331    match &handle.data {
332        NodeData::Text { contents } => {
333            out.push_str(contents.borrow().as_ref());
334        }
335        _ => {
336            for child in handle.children.borrow().iter() {
337                collect_text(child, out);
338            }
339        }
340    }
341}
342
/// Escapes `&`, `<`, `>` and `"` so that `s` can be embedded in HTML text or inside a
/// double-quoted attribute value. All other characters are copied unchanged.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
349
350fn extract_title(root: &Handle) -> Option<String> {
351    // <title> タグを優先、次に <h1> を試みる
352    if let Some(title_node) = find_tag(root, "title") {
353        let mut text = String::new();
354        collect_text(&title_node, &mut text);
355        let trimmed = text.trim().to_owned();
356        if !trimmed.is_empty() {
357            return Some(trimmed);
358        }
359    }
360    if let Some(h1) = find_tag(root, "h1") {
361        let mut text = String::new();
362        collect_text(&h1, &mut text);
363        let trimmed = text.trim().to_owned();
364        if !trimmed.is_empty() {
365            return Some(trimmed);
366        }
367    }
368    None
369}
370
371fn extract_metadata(root: &Handle, base_url: &Url) -> PageMetadata {
372    let mut meta = PageMetadata {
373        description: None,
374        og_title: None,
375        og_image: None,
376        canonical: None,
377        published_at: None,
378    };
379    collect_meta(root, &mut meta, base_url);
380    meta
381}
382
383fn collect_meta(handle: &Handle, meta: &mut PageMetadata, base_url: &Url) {
384    if let NodeData::Element { name, attrs, .. } = &handle.data {
385        let tag = name.local.as_ref();
386        let attrs_ref = attrs.borrow();
387
388        if tag == "meta" {
389            let name_attr = attrs_ref.iter()
390                .find(|a| a.name.local.as_ref() == "name")
391                .map(|a| a.value.as_ref().to_lowercase());
392            let property_attr = attrs_ref.iter()
393                .find(|a| a.name.local.as_ref() == "property")
394                .map(|a| a.value.as_ref().to_lowercase());
395            let content = attrs_ref.iter()
396                .find(|a| a.name.local.as_ref() == "content")
397                .map(|a| a.value.as_ref().to_owned());
398
399            match (name_attr.as_deref(), property_attr.as_deref(), content) {
400                (Some("description"), _, Some(c)) => meta.description = Some(c),
401                (_, Some("og:title"), Some(c)) => meta.og_title = Some(c),
402                (_, Some("og:image"), Some(c)) => meta.og_image = Some(c),
403                _ => {}
404            }
405        } else if tag == "link" {
406            let is_canonical = attrs_ref.iter()
407                .any(|a| a.name.local.as_ref() == "rel" && a.value.as_ref() == "canonical");
408            if is_canonical {
409                if let Some(href) = attrs_ref.iter()
410                    .find(|a| a.name.local.as_ref() == "href")
411                    .and_then(|a| base_url.join(a.value.as_ref()).ok())
412                {
413                    meta.canonical = Some(href);
414                }
415            }
416        }
417    }
418
419    for child in handle.children.borrow().iter() {
420        collect_meta(child, meta, base_url);
421    }
422}