content_extractor_rl/
html_parser.rs1use scraper::{Html, Selector, ElementRef, Element};
8use crate::Result;
9
10
/// Stateless namespace for HTML parsing and DOM-inspection helpers.
///
/// The type carries no data; every helper is an associated function, so it
/// is never necessary to construct a value. The derives exist so the type can
/// still be embedded in other `Debug`/`Clone`/`Default` structures.
#[derive(Debug, Clone, Copy, Default)]
pub struct HtmlParser;
12
13impl HtmlParser {
14
15 pub fn parse(html: &str) -> Result<Html> {
17 Ok(Html::parse_document(html))
18 }
19
20 pub fn extract_text(element: ElementRef) -> String {
22 element.text().collect::<Vec<_>>().join(" ")
23 }
24
25 pub fn get_element_path(element: ElementRef) -> String {
27 let mut path = Vec::new();
28 let mut current = Some(element);
29
30 while let Some(elem) = current {
31 let tag = elem.value().name();
32
33 let position = elem.prev_siblings()
35 .filter(|s| s.value().as_element().is_some_and(|e| e.name() == tag))
36 .count() + 1;
37
38 path.push(format!("{}[{}]", tag, position));
39 current = elem.parent().and_then(ElementRef::wrap);
40 }
41
42 path.reverse();
43 format!("/{}", path.join("/"))
44 }
45
46
47 pub fn clean_html(html: &str) -> Result<Html> {
49 let document = Html::parse_document(html);
50
51 let mut cleaned = html.to_string();
53
54 let script_re = regex::Regex::new(r"(?is)<script\b[^>]*>.*?</script>").unwrap();
56 cleaned = script_re.replace_all(&cleaned, "").to_string();
57
58 let style_re = regex::Regex::new(r"(?is)<style\b[^>]*>.*?</style>").unwrap();
60 cleaned = style_re.replace_all(&cleaned, "").to_string();
61
62 let comment_re = regex::Regex::new(r"(?s)<!--.*?-->").unwrap();
64 cleaned = comment_re.replace_all(&cleaned, "").to_string();
65
66 let event_re = regex::Regex::new(r#"\son\w+\s*=\s*["'][^"']*["']"#).unwrap();
68 cleaned = event_re.replace_all(&cleaned, "").to_string();
69
70 let js_protocol_re = regex::Regex::new(r#"javascript:[^"'\s>]*"#).unwrap();
72 cleaned = js_protocol_re.replace_all(&cleaned, "").to_string();
73
74 let script_selector = Selector::parse("script").unwrap();
76 for element in document.select(&script_selector) {
77 if let Some(html) = element.html().get(0..100) {
78 cleaned = cleaned.replace(html, "");
79 }
80 }
81
82 let style_selector = Selector::parse("style").unwrap();
84 for element in document.select(&style_selector) {
85 if let Some(html) = element.html().get(0..100) {
86 cleaned = cleaned.replace(html, "");
87 }
88 }
89
90 Ok(Html::parse_document(&cleaned))
91 }
92
93 pub fn get_candidate_nodes(document: &Html, top_k: usize) -> Vec<ElementRef<'_>> {
95 let mut candidates = Vec::new();
96
97 let article_selector = Selector::parse("article").unwrap();
99 for element in document.select(&article_selector) {
100 candidates.push(element);
101 }
102
103 let div_selector = Selector::parse("div").unwrap();
105 for element in document.select(&div_selector) {
106 candidates.push(element);
107 }
108
109 let section_selector = Selector::parse("section").unwrap();
111 for element in document.select(§ion_selector) {
112 candidates.push(element);
113 }
114
115 candidates.truncate(top_k);
116 candidates
117 }
118
119 pub fn extract_paragraphs(element: ElementRef) -> Vec<String> {
121 let p_selector = Selector::parse("p").unwrap();
122
123 element.select(&p_selector)
124 .map(|p| Self::extract_text(p).trim().to_string())
125 .filter(|text| !text.is_empty())
126 .collect()
127 }
128
129 pub fn get_parent(element: ElementRef) -> Option<ElementRef> {
131 element.parent().and_then(ElementRef::wrap)
132 }
133
134 pub fn get_prev_sibling(element: ElementRef) -> Option<ElementRef> {
136 element.prev_sibling_element()
137 }
138
139 pub fn get_next_sibling(element: ElementRef) -> Option<ElementRef> {
141 element.next_sibling_element()
142 }
143
144 pub fn count_children(element: ElementRef) -> usize {
146 element.children().filter(|n| n.value().is_element()).count()
147 }
148
149 pub fn get_tree_depth(document: &Html) -> usize {
151 fn depth_recursive(element: ElementRef) -> usize {
152 let children: Vec<_> = element.children()
153 .filter_map(ElementRef::wrap)
154 .collect();
155
156 if children.is_empty() {
157 1
158 } else {
159 1 + children.into_iter()
160 .map(depth_recursive)
161 .max()
162 .unwrap_or(0)
163 }
164 }
165
166 document.root_element()
167 .children()
168 .filter_map(ElementRef::wrap)
169 .map(depth_recursive)
170 .max()
171 .unwrap_or(0)
172 }
173
174 pub fn get_node_depth(element: ElementRef) -> usize {
176 let mut depth = 0;
177 let mut current = Some(element);
178
179 while let Some(elem) = current {
180 depth += 1;
181 current = elem.parent().and_then(ElementRef::wrap);
182 }
183
184 depth
185 }
186}
187
#[cfg(test)]
mod tests {
    use super::*;

    /// Parsing keeps the text content of the source markup.
    #[test]
    fn test_parse_html() {
        let html = r#"<html><body><p>Hello World</p></body></html>"#;
        let doc = HtmlParser::parse(html).unwrap();
        assert!(doc.root_element().html().contains("Hello World"));
    }

    /// Paragraph extraction returns the trimmed text of each `<p>`, in order.
    #[test]
    fn test_extract_paragraphs() {
        let html = r#"
            <article>
                <p>First paragraph.</p>
                <p>Second paragraph.</p>
            </article>
        "#;
        let doc = HtmlParser::parse(html).unwrap();
        let article = doc.select(&Selector::parse("article").unwrap()).next().unwrap();
        let paragraphs = HtmlParser::extract_paragraphs(article);
        // Check the contents, not just the count, so dropped or reordered
        // text is caught.
        assert_eq!(paragraphs, vec!["First paragraph.", "Second paragraph."]);
    }

    /// Element paths are rooted at the outermost element and use 1-based,
    /// per-tag sibling positions.
    #[test]
    fn test_get_element_path() {
        let html = r#"<html><body><div><p>a</p><p>b</p></div></body></html>"#;
        let doc = HtmlParser::parse(html).unwrap();
        let second_p = doc.select(&Selector::parse("p").unwrap()).nth(1).unwrap();
        assert_eq!(
            HtmlParser::get_element_path(second_p),
            "/html[1]/body[1]/div[1]/p[2]"
        );
    }
}