use html5ever::{driver::parse_document, tendril::TendrilSink};
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use regex::{Captures, Regex};
use std::sync::LazyLock;
/// Lazily compiled pattern for XML-style self-closing tags such as `<name attrs/>`.
///
/// Capture group 1 is the tag name; capture group 2 is everything between the
/// name and the closing `/>` (it never contains `<` or `>`).
static SELFCLOSING_RE: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r#"<([A-Za-z][A-Za-z0-9:-]*)([^<>]*?)/>"#;
    Regex::new(pattern).expect("self-closing tag regex is valid")
});
/// Lower-case names of the tags that are left untouched when written in
/// self-closing form (`<br/>`); every other self-closing tag is expanded into
/// an explicit open/close pair before parsing.
///
/// Entries must stay lower-case ASCII: lookups compare against them
/// case-insensitively.
const VOID_TAGS: &[&str] = &[
    "area", "base", "br", "col",
    "embed", "hr", "img", "input",
    "link", "meta", "param", "source",
    "track", "wbr", "command", "keygen",
];
/// Parses `html` as a full document and returns the handle of the document node.
///
/// Before parsing, non-void self-closing tags (`<div/>`) are rewritten into an
/// explicit open/close pair. The rewrite is purely textual and is applied to
/// the whole input string, regardless of where the match occurs.
pub(crate) fn parse_html(html: &str) -> Handle {
    let normalized = normalize_self_closing_tags(html);
    let parser = parse_document(RcDom::default(), Default::default()).from_utf8();
    let dom = parser.one(normalized.as_bytes());
    dom.document
}
/// Rewrites every self-closing tag that is not listed in `VOID_TAGS` as an
/// empty element: `<name attrs/>` becomes `<name attrs></name>`.
///
/// Tags whose name matches a `VOID_TAGS` entry (ASCII case-insensitively) are
/// copied through unchanged. The original spelling of the tag name and the
/// attribute text are preserved verbatim in the rewritten output.
fn normalize_self_closing_tags(html: &str) -> String {
    let rewritten = SELFCLOSING_RE.replace_all(html, |caps: &Captures<'_>| {
        let name = &caps[1];
        let attrs = &caps[2];
        // VOID_TAGS is all lower-case ASCII, so a case-insensitive comparison
        // is equivalent to lower-casing the captured name first.
        let is_void = VOID_TAGS.iter().any(|void| void.eq_ignore_ascii_case(name));
        if is_void {
            return caps[0].to_owned();
        }
        format!("<{}{}></{}>", name, attrs, name)
    });
    rewritten.into_owned()
}
/// Returns a snapshot of `node`'s direct children, in order.
///
/// The handles are cloned out of the node's child list, so the returned
/// vector does not keep the list borrowed.
pub(crate) fn children(node: &Handle) -> Vec<Handle> {
    let kids = node.children.borrow();
    kids.to_vec()
}
/// Returns `true` when `node` is a text node.
pub(crate) fn is_text(node: &Handle) -> bool {
    let data = &node.data;
    matches!(data, NodeData::Text { .. })
}
/// Returns a copy of the contents of a text node, or `None` when `node` is
/// any other kind of node.
pub(crate) fn text(node: &Handle) -> Option<String> {
    let NodeData::Text { contents } = &node.data else {
        return None;
    };
    let value = contents.borrow().to_string();
    Some(value)
}
/// Returns the local name of an element node converted to ASCII upper-case
/// (e.g. `"DIV"`), or `None` when `node` is not an element.
pub(crate) fn tag_name(node: &Handle) -> Option<String> {
    let NodeData::Element { name, .. } = &node.data else {
        return None;
    };
    let mut upper = name.local.to_string();
    upper.make_ascii_uppercase();
    Some(upper)
}
/// Looks up an attribute on an element node and returns a copy of its value.
///
/// The attribute's local name is compared with `name` ASCII
/// case-insensitively, and the first match in attribute order wins. Returns
/// `None` when `node` is not an element or has no such attribute.
pub(crate) fn attr(node: &Handle, name: &str) -> Option<String> {
    let NodeData::Element { attrs, .. } = &node.data else {
        return None;
    };
    let attributes = attrs.borrow();
    let value = attributes.iter().find_map(|candidate| {
        candidate
            .name
            .local
            .as_ref()
            .eq_ignore_ascii_case(name)
            .then(|| candidate.value.to_string())
    });
    value
}
/// Concatenates the contents of every text node in the subtree rooted at
/// `node`, in document order, and returns the result.
pub(crate) fn inner_text(node: &Handle) -> String {
    let mut buffer = String::new();
    collect_inner_text(node, &mut buffer);
    buffer
}
fn collect_inner_text(node: &Handle, result: &mut String) {
if let Some(text) = text(node) {
result.push_str(&text);
return;
}
for child in children(node) {
collect_inner_text(&child, result);
}
}