use scraper::{Html, Selector, ElementRef, Element};
use crate::Result;
/// Stateless namespace for the HTML parsing and DOM-inspection helpers
/// defined in this file.
///
/// The type has no fields; its helpers are associated functions called as
/// `HtmlParser::name(...)` and never take `self`.
pub struct HtmlParser;
impl HtmlParser {
/// Parses `html` as a complete HTML document.
///
/// # Errors
///
/// This function currently always returns `Ok`; the `Result` wrapper only
/// matches the crate-wide return convention.
pub fn parse(html: &str) -> Result<Html> {
    let document = Html::parse_document(html);
    Ok(document)
}
/// Joins every text fragment yielded by `element.text()` into one string,
/// inserting a single space between consecutive fragments.
///
/// Fragments are used verbatim: nothing is trimmed and existing whitespace
/// inside a fragment is left untouched.
pub fn extract_text(element: ElementRef) -> String {
    let mut combined = String::new();
    for (index, fragment) in element.text().enumerate() {
        // The separator goes between fragments only, never before the first.
        if index > 0 {
            combined.push(' ');
        }
        combined.push_str(fragment);
    }
    combined
}
/// Builds an XPath-like location string for `element`, e.g.
/// `/html[1]/body[1]/div[2]`.
///
/// Each segment is `tag[n]`, where `n` is the 1-based position of the node
/// among its preceding siblings that share the same tag name. Segments are
/// listed from the outermost element ancestor down to `element` itself.
pub fn get_element_path(element: ElementRef) -> String {
    // Walk from `element` up through successive element parents; the chain
    // ends at the first parent that is not an element.
    let lineage = std::iter::successors(Some(element), |node| {
        node.parent().and_then(ElementRef::wrap)
    });

    let mut segments: Vec<String> = lineage
        .map(|node| {
            let tag = node.value().name();
            let earlier_same_tag = node
                .prev_siblings()
                .filter_map(ElementRef::wrap)
                .filter(|sibling| sibling.value().name() == tag)
                .count();
            format!("{}[{}]", tag, earlier_same_tag + 1)
        })
        .collect();

    // Segments were gathered innermost-first; flip them to outermost-first.
    segments.reverse();
    format!("/{}", segments.join("/"))
}
/// Strips non-content markup from `html` and parses what remains.
///
/// The raw text is filtered with regular expressions before parsing. The
/// following are removed, in this order:
///
/// 1. `<script>` elements together with their contents,
/// 2. `<style>` elements together with their contents,
/// 3. HTML comments (`<!-- ... -->`),
/// 4. quoted inline event-handler attributes (`onclick="..."`, `onload='...'`, ...),
/// 5. `javascript:` URL payloads (lower-case spelling only).
///
/// A `<script>` or `<style>` element whose closing tag is missing is removed
/// through to the end of the input.
///
/// This is a best-effort text filter intended to reduce noise before content
/// extraction. It is **not** a security sanitizer: patterns are matched
/// against the raw text, so they may miss obfuscated markup and may also
/// remove look-alike sequences that occur in ordinary text.
///
/// # Errors
///
/// This function currently always returns `Ok`; the `Result` wrapper only
/// matches the crate-wide return convention.
pub fn clean_html(html: &str) -> Result<Html> {
    // Compiled once per process; the patterns are constants.
    static PATTERNS: std::sync::OnceLock<Vec<regex::Regex>> = std::sync::OnceLock::new();
    let patterns = PATTERNS.get_or_init(|| {
        [
            // The `\z` alternative covers an element that is never closed, and the
            // closing tag tolerates trailing whitespace/attributes (`</script >`).
            r"(?is)<script\b[^>]*>.*?(?:</script\b[^>]*>|\z)",
            r"(?is)<style\b[^>]*>.*?(?:</style\b[^>]*>|\z)",
            r"(?s)<!--.*?-->",
            // The value must be closed by the same quote that opened it, so a value
            // such as "alert('x')" is removed whole instead of being cut at the
            // first inner quote.
            r#"(?i)\son\w+\s*=\s*(?:"[^"]*"|'[^']*')"#,
            r#"javascript:[^"'\s>]*"#,
        ]
        .iter()
        .map(|pattern| {
            regex::Regex::new(pattern).expect("hard-coded cleaning pattern is valid")
        })
        .collect()
    });

    let mut cleaned = html.to_owned();
    for pattern in patterns {
        cleaned = pattern.replace_all(&cleaned, "").into_owned();
    }

    Ok(Html::parse_document(&cleaned))
}
/// Collects container elements that may hold the main content of `document`.
///
/// Candidates are every `<article>`, followed by every `<div>`, followed by
/// every `<section>`, each group in the order `Html::select` yields it. At
/// most the first `top_k` elements of that combined sequence are returned.
///
/// No scoring or ranking is applied: `top_k` is purely a cap on how many
/// elements are returned, so `top_k == 0` yields an empty vector.
pub fn get_candidate_nodes(document: &Html, top_k: usize) -> Vec<ElementRef<'_>> {
    let article_selector = Selector::parse("article").expect("`article` is a valid selector");
    let div_selector = Selector::parse("div").expect("`div` is a valid selector");
    let section_selector = Selector::parse("section").expect("`section` is a valid selector");

    document
        .select(&article_selector)
        .chain(document.select(&div_selector))
        .chain(document.select(&section_selector))
        // Stop as soon as the cap is reached instead of collecting everything
        // and truncating afterwards.
        .take(top_k)
        .collect()
}
/// Returns the trimmed text of every `<p>` element selected beneath
/// `element`, skipping paragraphs whose text is empty after trimming.
///
/// Paragraph text is produced by `Self::extract_text`.
pub fn extract_paragraphs(element: ElementRef) -> Vec<String> {
    let paragraph_selector = Selector::parse("p").unwrap();

    let mut paragraphs = Vec::new();
    for paragraph in element.select(&paragraph_selector) {
        let raw = Self::extract_text(paragraph);
        let text = raw.trim();
        if !text.is_empty() {
            paragraphs.push(text.to_string());
        }
    }
    paragraphs
}
/// Returns the parent of `element` when that parent is itself an element.
///
/// Yields `None` if `element` has no parent or if the parent is some other
/// kind of node.
pub fn get_parent(element: ElementRef) -> Option<ElementRef> {
    let parent_node = element.parent()?;
    ElementRef::wrap(parent_node)
}
pub fn get_prev_sibling(element: ElementRef) -> Option<ElementRef> {
element.prev_sibling_element()
}
pub fn get_next_sibling(element: ElementRef) -> Option<ElementRef> {
element.next_sibling_element()
}
pub fn count_children(element: ElementRef) -> usize {
element.children().filter(|n| n.value().is_element()).count()
}
/// Returns the maximum element nesting depth beneath the document's root
/// element.
///
/// Direct element children of the root element are at depth 1, their element
/// children at depth 2, and so on; the root element itself is not counted.
/// Returns 0 when the root element has no element children.
///
/// The tree is walked with an explicit work list rather than recursion, so
/// pathologically deep markup cannot exhaust the call stack.
///
/// NOTE(review): `get_node_depth` counts the element itself plus every
/// element ancestor (root included), so its values are not on the same scale
/// as this function's result — confirm callers account for that.
pub fn get_tree_depth(document: &Html) -> usize {
    // Each entry pairs an element with its depth below the root element.
    let mut pending: Vec<_> = document
        .root_element()
        .children()
        .filter_map(ElementRef::wrap)
        .map(|child| (child, 1usize))
        .collect();

    let mut deepest = 0;
    while let Some((node, depth)) = pending.pop() {
        deepest = deepest.max(depth);
        pending.extend(
            node.children()
                .filter_map(ElementRef::wrap)
                .map(|child| (child, depth + 1)),
        );
    }
    deepest
}
/// Returns the depth of `element`, counting `element` itself as 1 and adding
/// one for each consecutive element ancestor above it.
///
/// Counting stops at the first ancestor that is not an element.
pub fn get_node_depth(element: ElementRef) -> usize {
    let element_ancestors = element.ancestors().map_while(ElementRef::wrap).count();
    // `+ 1` accounts for `element` itself.
    element_ancestors + 1
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Text from the input markup is still present after parsing.
    #[test]
    fn test_parse_html() {
        let markup = r#"<html><body><p>Hello World</p></body></html>"#;
        let document = HtmlParser::parse(markup).unwrap();
        let rendered = document.root_element().html();
        assert!(rendered.contains("Hello World"));
    }

    /// Both non-empty paragraphs inside the article are extracted.
    #[test]
    fn test_extract_paragraphs() {
        let markup = r#"
<article>
<p>First paragraph.</p>
<p>Second paragraph.</p>
</article>
"#;
        let document = HtmlParser::parse(markup).unwrap();
        let article_selector = Selector::parse("article").unwrap();
        let article = document.select(&article_selector).next().unwrap();
        let extracted = HtmlParser::extract_paragraphs(article);
        assert_eq!(extracted.len(), 2);
    }
}