Skip to main content

j_agent/util/
html_extract.rs

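//! Utilities for intelligently extracting readable content from HTML.
//!
//! Provides shared functions for pulling the readable main text out of HTML;
//! they are used by both the `web_fetch` and `browser` modules.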
5
6use scraper::{Html, Selector};
7
8/// 智能提取网页正文区域的 HTML
9///
10/// 按优先级尝试 article / main / 常见内容 class,
11/// 匹配不到则回退到 `<body>`,最后回退到整个文档。
12pub fn extract_readable_content(document: &Html) -> String {
13    let content_selectors = [
14        "article",
15        "main",
16        "[role=\"main\"]",
17        ".post-content",
18        ".article-content",
19        ".entry-content",
20        ".content",
21        "#content",
22        ".post",
23        ".article",
24    ];
25
26    for selector_str in content_selectors {
27        if let Ok(selector) = Selector::parse(selector_str)
28            && let Some(element) = document.select(&selector).next()
29        {
30            return element.html();
31        }
32    }
33
34    if let Ok(body_selector) = Selector::parse("body")
35        && let Some(body) = document.select(&body_selector).next()
36    {
37        return body.html();
38    }
39
40    document.html()
41}
42
43/// 将 HTML 转换为干净的纯文本
44///
45/// - 跳过 script / style / nav / header / footer / aside / noscript
46/// - 在块级元素处插入换行以保持可读性
47/// - 去除空行和多余空白
48pub fn html_to_text(html: &str) -> String {
49    let document = Html::parse_fragment(html);
50    let mut text = String::new();
51
52    fn extract_text(node: scraper::ElementRef, text: &mut String) {
53        for child in node.children() {
54            if let Some(element) = scraper::ElementRef::wrap(child) {
55                let tag = element.value().name();
56                if matches!(
57                    tag,
58                    "script" | "style" | "nav" | "header" | "footer" | "aside" | "noscript"
59                ) {
60                    continue;
61                }
62                if matches!(
63                    tag,
64                    "p" | "div" | "br" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "li" | "tr"
65                ) {
66                    text.push('\n');
67                }
68                extract_text(element, text);
69            } else if let Some(t) = child.value().as_text() {
70                let trimmed = t.trim();
71                if !trimmed.is_empty() {
72                    if !text.is_empty() && !text.ends_with('\n') && !text.ends_with(' ') {
73                        text.push(' ');
74                    }
75                    text.push_str(trimmed);
76                }
77            }
78        }
79    }
80
81    if let Ok(root_selector) = Selector::parse(":root")
82        && let Some(root) = document.select(&root_selector).next()
83    {
84        extract_text(root, &mut text);
85    }
86
87    text.lines()
88        .map(|l| l.trim())
89        .filter(|l| !l.is_empty())
90        .collect::<Vec<_>>()
91        .join("\n")
92}
93
94/// 从原始 HTML 中提取可读正文并转为纯文本
95///
96/// 组合 `extract_readable_content` + `html_to_text` 的便捷方法。
97pub fn extract_text_from_html(raw_html: &str) -> String {
98    let document = Html::parse_document(raw_html);
99    let content_html = extract_readable_content(&document);
100    html_to_text(&content_html)
101}