halldyll_core/parse/
html.rs

1//! HTML - Robust HTML parsing
2
3use scraper::{Html, Selector, ElementRef};
4use encoding_rs::Encoding;
5
6/// HTML Parser
7pub struct HtmlParser {
8    document: Html,
9}
10
11impl HtmlParser {
12    /// Parse an HTML document
13    pub fn parse(html: &str) -> Self {
14        let document = Html::parse_document(html);
15        Self { document }
16    }
17
18    /// Parse with encoding detection
19    pub fn parse_with_encoding(bytes: &[u8], declared_charset: Option<&str>) -> Self {
20        let (html, _) = Self::decode_html(bytes, declared_charset);
21        Self::parse(&html)
22    }
23
24    /// Decode HTML with the correct encoding
25    fn decode_html(bytes: &[u8], declared_charset: Option<&str>) -> (String, &'static Encoding) {
26        // 1. Try the declared charset
27        if let Some(charset) = declared_charset {
28            if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
29                let (decoded, _, _) = encoding.decode(bytes);
30                return (decoded.into_owned(), encoding);
31            }
32        }
33
34        // 2. Detect from meta tags (first 1024 bytes)
35        let peek = &bytes[..std::cmp::min(1024, bytes.len())];
36        let peek_str = String::from_utf8_lossy(peek);
37        
38        if let Some(charset) = Self::detect_meta_charset(&peek_str) {
39            if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
40                let (decoded, _, _) = encoding.decode(bytes);
41                return (decoded.into_owned(), encoding);
42            }
43        }
44
45        // 3. Default UTF-8
46        let (decoded, _, _) = encoding_rs::UTF_8.decode(bytes);
47        (decoded.into_owned(), encoding_rs::UTF_8)
48    }
49
50    /// Detect charset from meta tags
51    fn detect_meta_charset(html: &str) -> Option<String> {
52        let html_lower = html.to_lowercase();
53        
54        // <meta charset="...">
55        if let Some(pos) = html_lower.find("charset=") {
56            let rest = &html[pos + 8..];
57            let charset: String = rest
58                .chars()
59                .skip_while(|&c| c == '"' || c == '\'')
60                .take_while(|&c| c != '"' && c != '\'' && c != ' ' && c != ';' && c != '>')
61                .collect();
62            if !charset.is_empty() {
63                return Some(charset);
64            }
65        }
66
67        None
68    }
69
70    /// Access to the parsed document
71    pub fn document(&self) -> &Html {
72        &self.document
73    }
74
75    /// Select elements
76    pub fn select(&self, selector: &str) -> Vec<ElementRef<'_>> {
77        match Selector::parse(selector) {
78            Ok(sel) => self.document.select(&sel).collect(),
79            Err(_) => Vec::new(),
80        }
81    }
82
83    /// Select the first element
84    pub fn select_first(&self, selector: &str) -> Option<ElementRef<'_>> {
85        Selector::parse(selector)
86            .ok()
87            .and_then(|sel| self.document.select(&sel).next())
88    }
89
90    /// Get the text of an element
91    pub fn text(&self, selector: &str) -> Option<String> {
92        self.select_first(selector)
93            .map(|el| el.text().collect::<Vec<_>>().join(" ").trim().to_string())
94    }
95
96    /// Get an attribute of an element
97    pub fn attr(&self, selector: &str, attr: &str) -> Option<String> {
98        self.select_first(selector)
99            .and_then(|el| el.value().attr(attr).map(String::from))
100    }
101
102    /// Get the inner HTML of an element
103    pub fn inner_html(&self, selector: &str) -> Option<String> {
104        self.select_first(selector).map(|el| el.inner_html())
105    }
106
107    /// Check if an element exists
108    pub fn exists(&self, selector: &str) -> bool {
109        self.select_first(selector).is_some()
110    }
111
112    /// Count elements
113    pub fn count(&self, selector: &str) -> usize {
114        self.select(selector).len()
115    }
116}
117
118/// Clean HTML (remove scripts, styles, etc.)
119pub fn sanitize_html(html: &str) -> String {
120    let document = Html::parse_document(html);
121    let mut output = String::new();
122
123    // Selectors to exclude
124    let exclude_selectors = [
125        "script", "style", "noscript", "iframe", "svg", "canvas",
126        "template", "object", "embed", "applet",
127    ];
128
129    // Traverse and rebuild
130    for node in document.root_element().descendants() {
131        if let Some(element) = node.value().as_element() {
132            let tag_name = element.name();
133            if !exclude_selectors.contains(&tag_name) {
134                // Add the text
135                if let Some(text) = node.value().as_text() {
136                    let text = text.trim();
137                    if !text.is_empty() {
138                        output.push_str(text);
139                        output.push(' ');
140                    }
141                }
142            }
143        }
144    }
145
146    output.trim().to_string()
147}