Skip to main content

content_extractor_rl/
html_parser.rs

1//! HTML parsing and DOM manipulation utilities
2
3// ============================================================================
4// FILE: crates/content-extractor-rl/src/html_parser.rs
5// ============================================================================
6
7use scraper::{Html, Selector, ElementRef, Element};
8use crate::Result;
9
10
/// Namespace for stateless HTML parsing and DOM-inspection helpers.
///
/// The type carries no data; every operation is an associated function
/// invoked as `HtmlParser::name(...)`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct HtmlParser;
12
13impl HtmlParser {
14
15    /// Parse HTML string into document
16    pub fn parse(html: &str) -> Result<Html> {
17        Ok(Html::parse_document(html))
18    }
19
20    /// Extract text content from element
21    pub fn extract_text(element: ElementRef) -> String {
22        element.text().collect::<Vec<_>>().join(" ")
23    }
24
25    /// Get XPath-like selector for element
26    pub fn get_element_path(element: ElementRef) -> String {
27        let mut path = Vec::new();
28        let mut current = Some(element);
29
30        while let Some(elem) = current {
31            let tag = elem.value().name();
32
33            // Get position among siblings
34            let position = elem.prev_siblings()
35                .filter(|s| s.value().as_element().is_some_and(|e| e.name() == tag))
36                .count() + 1;
37
38            path.push(format!("{}[{}]", tag, position));
39            current = elem.parent().and_then(ElementRef::wrap);
40        }
41
42        path.reverse();
43        format!("/{}", path.join("/"))
44    }
45
46
47    /// Clean HTML by removing script, style, comments, etc.
48    pub fn clean_html(html: &str) -> Result<Html> {
49        let document = Html::parse_document(html);
50
51        // Create cleaned HTML string (simplified - proper cleaning would modify DOM)
52        let mut cleaned = html.to_string();
53
54        // Remove script tags and their content (case-insensitive, multiline)
55        let script_re = regex::Regex::new(r"(?is)<script\b[^>]*>.*?</script>").unwrap();
56        cleaned = script_re.replace_all(&cleaned, "").to_string();
57
58        // Remove style tags and their content
59        let style_re = regex::Regex::new(r"(?is)<style\b[^>]*>.*?</style>").unwrap();
60        cleaned = style_re.replace_all(&cleaned, "").to_string();
61
62        // Remove HTML comments
63        let comment_re = regex::Regex::new(r"(?s)<!--.*?-->").unwrap();
64        cleaned = comment_re.replace_all(&cleaned, "").to_string();
65
66        // Remove inline JavaScript event handlers
67        let event_re = regex::Regex::new(r#"\son\w+\s*=\s*["'][^"']*["']"#).unwrap();
68        cleaned = event_re.replace_all(&cleaned, "").to_string();
69
70        // Remove JavaScript: protocol links
71        let js_protocol_re = regex::Regex::new(r#"javascript:[^"'\s>]*"#).unwrap();
72        cleaned = js_protocol_re.replace_all(&cleaned, "").to_string();
73
74        // Remove script tags
75        let script_selector = Selector::parse("script").unwrap();
76        for element in document.select(&script_selector) {
77            if let Some(html) = element.html().get(0..100) {
78                cleaned = cleaned.replace(html, "");
79            }
80        }
81
82        // Remove style tags
83        let style_selector = Selector::parse("style").unwrap();
84        for element in document.select(&style_selector) {
85            if let Some(html) = element.html().get(0..100) {
86                cleaned = cleaned.replace(html, "");
87            }
88        }
89
90        Ok(Html::parse_document(&cleaned))
91    }
92
93    /// Get candidate article nodes from document
94    pub fn get_candidate_nodes(document: &Html, top_k: usize) -> Vec<ElementRef<'_>> {
95        let mut candidates = Vec::new();
96
97        // Try article tags first
98        let article_selector = Selector::parse("article").unwrap();
99        for element in document.select(&article_selector) {
100            candidates.push(element);
101        }
102
103        // Try divs
104        let div_selector = Selector::parse("div").unwrap();
105        for element in document.select(&div_selector) {
106            candidates.push(element);
107        }
108
109        // Try sections
110        let section_selector = Selector::parse("section").unwrap();
111        for element in document.select(&section_selector) {
112            candidates.push(element);
113        }
114
115        candidates.truncate(top_k);
116        candidates
117    }
118
119    /// Extract paragraphs from element
120    pub fn extract_paragraphs(element: ElementRef) -> Vec<String> {
121        let p_selector = Selector::parse("p").unwrap();
122
123        element.select(&p_selector)
124            .map(|p| Self::extract_text(p).trim().to_string())
125            .filter(|text| !text.is_empty())
126            .collect()
127    }
128
129    /// Get parent element
130    pub fn get_parent(element: ElementRef) -> Option<ElementRef> {
131        element.parent().and_then(ElementRef::wrap)
132    }
133
134    /// Get previous sibling element
135    pub fn get_prev_sibling(element: ElementRef) -> Option<ElementRef> {
136        element.prev_sibling_element()
137    }
138
139    /// Get next sibling element
140    pub fn get_next_sibling(element: ElementRef) -> Option<ElementRef> {
141        element.next_sibling_element()
142    }
143
144    /// Count child elements
145    pub fn count_children(element: ElementRef) -> usize {
146        element.children().filter(|n| n.value().is_element()).count()
147    }
148
149    /// Get tree depth
150    pub fn get_tree_depth(document: &Html) -> usize {
151        fn depth_recursive(element: ElementRef) -> usize {
152            let children: Vec<_> = element.children()
153                .filter_map(ElementRef::wrap)
154                .collect();
155
156            if children.is_empty() {
157                1
158            } else {
159                1 + children.into_iter()
160                    .map(depth_recursive)
161                    .max()
162                    .unwrap_or(0)
163            }
164        }
165
166        document.root_element()
167            .children()
168            .filter_map(ElementRef::wrap)
169            .map(depth_recursive)
170            .max()
171            .unwrap_or(0)
172    }
173
174    /// Get node depth in tree
175    pub fn get_node_depth(element: ElementRef) -> usize {
176        let mut depth = 0;
177        let mut current = Some(element);
178
179        while let Some(elem) = current {
180            depth += 1;
181            current = elem.parent().and_then(ElementRef::wrap);
182        }
183
184        depth
185    }
186}
187
#[cfg(test)]
mod tests {
    use super::*;

    /// A parsed document still exposes the paragraph text through its root element.
    #[test]
    fn test_parse_html() {
        let parsed =
            HtmlParser::parse(r#"<html><body><p>Hello World</p></body></html>"#).unwrap();
        let serialized = parsed.root_element().html();
        assert!(serialized.contains("Hello World"));
    }

    /// Both `<p>` children of the `<article>` are returned.
    #[test]
    fn test_extract_paragraphs() {
        let markup = r#"
            <article>
                <p>First paragraph.</p>
                <p>Second paragraph.</p>
            </article>
        "#;
        let article_selector = Selector::parse("article").unwrap();
        let parsed = HtmlParser::parse(markup).unwrap();
        let first_article = parsed.select(&article_selector).next().unwrap();

        let paragraph_count = HtmlParser::extract_paragraphs(first_article).len();
        assert_eq!(paragraph_count, 2);
    }
}