j_agent/util/
html_extract.rs1use scraper::{Html, Selector};
7
8pub fn extract_readable_content(document: &Html) -> String {
13 let content_selectors = [
14 "article",
15 "main",
16 "[role=\"main\"]",
17 ".post-content",
18 ".article-content",
19 ".entry-content",
20 ".content",
21 "#content",
22 ".post",
23 ".article",
24 ];
25
26 for selector_str in content_selectors {
27 if let Ok(selector) = Selector::parse(selector_str)
28 && let Some(element) = document.select(&selector).next()
29 {
30 return element.html();
31 }
32 }
33
34 if let Ok(body_selector) = Selector::parse("body")
35 && let Some(body) = document.select(&body_selector).next()
36 {
37 return body.html();
38 }
39
40 document.html()
41}
42
43pub fn html_to_text(html: &str) -> String {
49 let document = Html::parse_fragment(html);
50 let mut text = String::new();
51
52 fn extract_text(node: scraper::ElementRef, text: &mut String) {
53 for child in node.children() {
54 if let Some(element) = scraper::ElementRef::wrap(child) {
55 let tag = element.value().name();
56 if matches!(
57 tag,
58 "script" | "style" | "nav" | "header" | "footer" | "aside" | "noscript"
59 ) {
60 continue;
61 }
62 if matches!(
63 tag,
64 "p" | "div" | "br" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "li" | "tr"
65 ) {
66 text.push('\n');
67 }
68 extract_text(element, text);
69 } else if let Some(t) = child.value().as_text() {
70 let trimmed = t.trim();
71 if !trimmed.is_empty() {
72 if !text.is_empty() && !text.ends_with('\n') && !text.ends_with(' ') {
73 text.push(' ');
74 }
75 text.push_str(trimmed);
76 }
77 }
78 }
79 }
80
81 if let Ok(root_selector) = Selector::parse(":root")
82 && let Some(root) = document.select(&root_selector).next()
83 {
84 extract_text(root, &mut text);
85 }
86
87 text.lines()
88 .map(|l| l.trim())
89 .filter(|l| !l.is_empty())
90 .collect::<Vec<_>>()
91 .join("\n")
92}
93
94pub fn extract_text_from_html(raw_html: &str) -> String {
98 let document = Html::parse_document(raw_html);
99 let content_html = extract_readable_content(&document);
100 html_to_text(&content_html)
101}