use scraper::{Html, Selector, ElementRef};
use encoding_rs::Encoding;
/// Thin convenience wrapper around a parsed [`scraper::Html`] document.
///
/// Construct one with [`HtmlParser::parse`] or
/// [`HtmlParser::parse_with_encoding`], then query it with CSS selectors.
pub struct HtmlParser {
    /// The parsed document that every query method operates on.
    document: Html,
}
impl HtmlParser {
    /// Number of leading bytes inspected when looking for a `charset=`
    /// declaration inside the document itself.
    const META_PRESCAN_LEN: usize = 1024;

    /// Parses an already-decoded HTML string into a queryable document.
    pub fn parse(html: &str) -> Self {
        let document = Html::parse_document(html);
        Self { document }
    }

    /// Decodes raw bytes to text and parses the result.
    ///
    /// `declared_charset` is an externally supplied encoding label (for
    /// example from an HTTP `Content-Type` header). When it is absent or not a
    /// recognised label, the first [`Self::META_PRESCAN_LEN`] bytes are
    /// scanned for a `charset=` declaration, and UTF-8 is the final fallback.
    pub fn parse_with_encoding(bytes: &[u8], declared_charset: Option<&str>) -> Self {
        let (html, _) = Self::decode_html(bytes, declared_charset);
        Self::parse(&html)
    }

    /// Decodes `bytes` and returns the text together with the encoding that
    /// was actually used.
    ///
    /// Candidate encodings are tried in order: the declared label, a label
    /// found in the document prefix, then UTF-8. `Encoding::decode` performs
    /// BOM sniffing, so the returned encoding is the one reported by the
    /// decoder rather than the candidate that was requested.
    fn decode_html(bytes: &[u8], declared_charset: Option<&str>) -> (String, &'static Encoding) {
        let encoding = declared_charset
            .and_then(|label| Encoding::for_label(label.as_bytes()))
            .or_else(|| Self::sniff_meta_encoding(bytes))
            .unwrap_or(encoding_rs::UTF_8);

        // A BOM overrides the requested encoding; report what the decoder
        // really used so the caller is not told the wrong thing.
        let (decoded, actual, _had_errors) = encoding.decode(bytes);
        (decoded.into_owned(), actual)
    }

    /// Looks for a usable encoding declared inside the document prefix.
    ///
    /// Returns `None` when no `charset=` declaration is found or its label is
    /// not recognised.
    fn sniff_meta_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
        let peek = &bytes[..bytes.len().min(Self::META_PRESCAN_LEN)];
        let charset = Self::detect_meta_charset(&String::from_utf8_lossy(peek))?;
        let encoding = Encoding::for_label(charset.as_bytes())?;

        // The declaration was located by scanning the bytes as ASCII-compatible
        // text, so the content cannot really be UTF-16. The HTML prescan rules
        // treat such a declaration as UTF-8.
        if encoding == encoding_rs::UTF_16LE || encoding == encoding_rs::UTF_16BE {
            Some(encoding_rs::UTF_8)
        } else {
            Some(encoding)
        }
    }

    /// Extracts the value following the first `charset=` (matched
    /// ASCII-case-insensitively) in `html`.
    ///
    /// Leading quote characters are skipped and the value ends at the next
    /// quote, space, `;` or `>`. Returns `None` when there is no declaration
    /// or the extracted value is empty.
    fn detect_meta_charset(html: &str) -> Option<String> {
        const NEEDLE: &[u8] = b"charset=";

        // Search the original bytes directly. Unicode lowercasing can change
        // byte lengths, which would make an offset found in a lowercased copy
        // invalid for slicing the original string.
        let start = html
            .as_bytes()
            .windows(NEEDLE.len())
            .position(|window| window.eq_ignore_ascii_case(NEEDLE))?
            + NEEDLE.len();

        // `start` sits right after an ASCII `=`, so it is a char boundary.
        let charset: String = html[start..]
            .chars()
            .skip_while(|&c| c == '"' || c == '\'')
            .take_while(|&c| !matches!(c, '"' | '\'' | ' ' | ';' | '>'))
            .collect();

        if charset.is_empty() {
            None
        } else {
            Some(charset)
        }
    }

    /// Returns the underlying parsed document.
    pub fn document(&self) -> &Html {
        &self.document
    }

    /// Returns every element matching the CSS `selector`.
    ///
    /// An unparsable selector yields an empty vector rather than an error.
    pub fn select(&self, selector: &str) -> Vec<ElementRef<'_>> {
        match Selector::parse(selector) {
            Ok(sel) => self.document.select(&sel).collect(),
            Err(_) => Vec::new(),
        }
    }

    /// Returns the first element matching the CSS `selector`, or `None` when
    /// nothing matches or the selector cannot be parsed.
    pub fn select_first(&self, selector: &str) -> Option<ElementRef<'_>> {
        Selector::parse(selector)
            .ok()
            .and_then(|sel| self.document.select(&sel).next())
    }

    /// Returns the text of the first match: its text nodes joined with single
    /// spaces, with surrounding whitespace trimmed.
    pub fn text(&self, selector: &str) -> Option<String> {
        self.select_first(selector)
            .map(|el| el.text().collect::<Vec<_>>().join(" ").trim().to_string())
    }

    /// Returns the value of attribute `attr` on the first match, or `None`
    /// when there is no match or the attribute is absent.
    pub fn attr(&self, selector: &str, attr: &str) -> Option<String> {
        self.select_first(selector)
            .and_then(|el| el.value().attr(attr).map(String::from))
    }

    /// Returns the serialized inner HTML of the first match.
    pub fn inner_html(&self, selector: &str) -> Option<String> {
        self.select_first(selector).map(|el| el.inner_html())
    }

    /// Reports whether at least one element matches `selector`.
    pub fn exists(&self, selector: &str) -> bool {
        self.select_first(selector).is_some()
    }

    /// Counts the elements matching `selector`; an unparsable selector counts
    /// as zero matches.
    pub fn count(&self, selector: &str) -> usize {
        // Count straight off the iterator instead of collecting a Vec first.
        Selector::parse(selector)
            .map(|sel| self.document.select(&sel).count())
            .unwrap_or(0)
    }
}
/// Reduces an HTML document to its plain text content.
///
/// Every non-blank text node is trimmed and the pieces are joined with single
/// spaces. Text nested anywhere inside a non-content element (`script`,
/// `style`, `noscript`, `iframe`, `svg`, `canvas`, `template`, `object`,
/// `embed`, `applet`) is dropped.
///
/// Returns an empty string when the document contains no such text.
pub fn sanitize_html(html: &str) -> String {
    /// Elements whose text content must not appear in the output.
    const EXCLUDED_TAGS: [&str; 10] = [
        "script", "style", "noscript", "iframe", "svg", "canvas",
        "template", "object", "embed", "applet",
    ];

    let document = Html::parse_document(html);
    let mut output = String::new();

    for node in document.root_element().descendants() {
        // Only text nodes carry content. An element node is never a text
        // node, so the text check must not be nested inside an element check.
        let text = match node.value().as_text() {
            Some(text) => text.trim(),
            None => continue,
        };
        if text.is_empty() {
            continue;
        }

        // Walk up the tree so text nested at any depth below an excluded
        // element is skipped, not just its direct children.
        let inside_excluded = node
            .ancestors()
            .filter_map(|ancestor| ancestor.value().as_element())
            .any(|element| EXCLUDED_TAGS.contains(&element.name()));
        if inside_excluded {
            continue;
        }

        if !output.is_empty() {
            output.push(' ');
        }
        output.push_str(text);
    }

    output
}