halldyll_core/parse/
html.rs1use scraper::{Html, Selector, ElementRef};
4use encoding_rs::Encoding;
5
6pub struct HtmlParser {
8 document: Html,
9}
10
11impl HtmlParser {
12 pub fn parse(html: &str) -> Self {
14 let document = Html::parse_document(html);
15 Self { document }
16 }
17
18 pub fn parse_with_encoding(bytes: &[u8], declared_charset: Option<&str>) -> Self {
20 let (html, _) = Self::decode_html(bytes, declared_charset);
21 Self::parse(&html)
22 }
23
24 fn decode_html(bytes: &[u8], declared_charset: Option<&str>) -> (String, &'static Encoding) {
26 if let Some(charset) = declared_charset {
28 if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
29 let (decoded, _, _) = encoding.decode(bytes);
30 return (decoded.into_owned(), encoding);
31 }
32 }
33
34 let peek = &bytes[..std::cmp::min(1024, bytes.len())];
36 let peek_str = String::from_utf8_lossy(peek);
37
38 if let Some(charset) = Self::detect_meta_charset(&peek_str) {
39 if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
40 let (decoded, _, _) = encoding.decode(bytes);
41 return (decoded.into_owned(), encoding);
42 }
43 }
44
45 let (decoded, _, _) = encoding_rs::UTF_8.decode(bytes);
47 (decoded.into_owned(), encoding_rs::UTF_8)
48 }
49
/// Extracts the value of the first `charset=` declaration found in `html`.
///
/// The search is ASCII case-insensitive. Leading quote characters after the
/// `=` are skipped and the value ends at the first quote, space, `;` or `>`.
/// Returns `None` when there is no `charset=` or when the extracted value is
/// empty.
fn detect_meta_charset(html: &str) -> Option<String> {
    const NEEDLE: &str = "charset=";

    // ASCII-only lowercasing never changes byte lengths, so an offset found in
    // the lowered copy is valid in `html` too. Full Unicode lowercasing can
    // grow or shrink characters and would misalign the slice below.
    let lowered = html.to_ascii_lowercase();
    let value_start = lowered.find(NEEDLE)? + NEEDLE.len();

    let charset: String = html[value_start..]
        .chars()
        .skip_while(|&c| c == '"' || c == '\'')
        .take_while(|&c| !matches!(c, '"' | '\'' | ' ' | ';' | '>'))
        .collect();

    if charset.is_empty() {
        None
    } else {
        Some(charset)
    }
}
69
70 pub fn document(&self) -> &Html {
72 &self.document
73 }
74
75 pub fn select(&self, selector: &str) -> Vec<ElementRef<'_>> {
77 match Selector::parse(selector) {
78 Ok(sel) => self.document.select(&sel).collect(),
79 Err(_) => Vec::new(),
80 }
81 }
82
83 pub fn select_first(&self, selector: &str) -> Option<ElementRef<'_>> {
85 Selector::parse(selector)
86 .ok()
87 .and_then(|sel| self.document.select(&sel).next())
88 }
89
90 pub fn text(&self, selector: &str) -> Option<String> {
92 self.select_first(selector)
93 .map(|el| el.text().collect::<Vec<_>>().join(" ").trim().to_string())
94 }
95
96 pub fn attr(&self, selector: &str, attr: &str) -> Option<String> {
98 self.select_first(selector)
99 .and_then(|el| el.value().attr(attr).map(String::from))
100 }
101
102 pub fn inner_html(&self, selector: &str) -> Option<String> {
104 self.select_first(selector).map(|el| el.inner_html())
105 }
106
107 pub fn exists(&self, selector: &str) -> bool {
109 self.select_first(selector).is_some()
110 }
111
112 pub fn count(&self, selector: &str) -> usize {
114 self.select(selector).len()
115 }
116}
117
118pub fn sanitize_html(html: &str) -> String {
120 let document = Html::parse_document(html);
121 let mut output = String::new();
122
123 let exclude_selectors = [
125 "script", "style", "noscript", "iframe", "svg", "canvas",
126 "template", "object", "embed", "applet",
127 ];
128
129 for node in document.root_element().descendants() {
131 if let Some(element) = node.value().as_element() {
132 let tag_name = element.name();
133 if !exclude_selectors.contains(&tag_name) {
134 if let Some(text) = node.value().as_text() {
136 let text = text.trim();
137 if !text.is_empty() {
138 output.push_str(text);
139 output.push(' ');
140 }
141 }
142 }
143 }
144 }
145
146 output.trim().to_string()
147}