use crate::dom::void_tag::VoidTag;
use regex::Regex;
use std::sync::OnceLock;
use super::types::Options;
/// Cached tag-scanning pattern, compiled on first use inside `valid`.
/// It matches either an HTML comment or an opening/closing tag.
static VALID_TAG_REGEX: OnceLock<Regex> = OnceLock::new();
/// Reports whether the tags in the HTML fragment `input` are balanced.
///
/// The fragment is wrapped in a synthetic `documentfragmentcontainer` element
/// and scanned tag by tag while a stack of currently open elements is kept.
/// The fragment is considered valid when, after the scan, only the synthetic
/// root entry is left on the stack.
///
/// Scanning rules:
/// * Comments and the synthetic wrapper tags are skipped.
/// * A tag is treated as self-closing when its attribute text ends with `/`
///   or when `VoidTag` (built from `opts.void_tag`) reports it as void.
/// * Some opening tags implicitly close the element on top of the stack
///   (for example a new `li` closes an open `li`).
/// * Some closing tags implicitly close intermediate elements
///   (for example `</ul>` closes an open `li`).
/// * The content of `script`, `style`, `pre` and `noscript` is not scanned;
///   the scan jumps to the matching closing tag, and the fragment is invalid
///   if there is none.
/// * A closing tag that matches nothing on top of the stack is ignored.
///
/// When `opts.lower_case_tag_name` is set, tag names are ASCII-lowercased
/// before they are stored and compared; otherwise they keep their spelling.
pub fn valid(input: &str, opts: &Options) -> bool {
    const FRAMEFLAG: &str = "documentfragmentcontainer";
    // Elements whose content is skipped verbatim up to their closing tag.
    const BLOCK_TEXT: [&str; 4] = ["script", "style", "pre", "noscript"];

    // Names of the opening tags that implicitly close an open `parent`.
    // Only all-lowercase and all-uppercase spellings are recognised, on both
    // the parent and the child side.
    fn closed_by_open(parent: &str) -> &'static [&'static str] {
        match parent {
            "li" | "LI" => &["li", "LI"],
            "p" | "P" => &["p", "P", "div", "DIV"],
            "b" | "B" => &["div", "DIV"],
            "td" | "TD" | "th" | "TH" => &["td", "th", "TD", "TH"],
            "h1" | "H1" => &["h1", "H1"],
            "h2" | "H2" => &["h2", "H2"],
            "h3" | "H3" => &["h3", "H3"],
            "h4" | "H4" => &["h4", "H4"],
            "h5" | "H5" => &["h5", "H5"],
            "h6" | "H6" => &["h6", "H6"],
            _ => &[],
        }
    }

    // Lowercased names of the closing tags that implicitly close an open
    // `parent`. Callers compare against the lowercased closing tag name, so
    // only lowercase entries are needed here.
    fn closed_by_close(parent: &str) -> &'static [&'static str] {
        match parent {
            "li" | "LI" => &["ul", "ol"],
            "a" | "A" | "b" | "B" | "i" | "I" | "p" | "P" => &["div"],
            "td" | "TD" | "th" | "TH" => &["tr", "table"],
            _ => &[],
        }
    }

    let data = format!("<{}>{}</{}>", FRAMEFLAG, input, FRAMEFLAG);
    let void_tag = VoidTag::new(&opts.void_tag);
    let tag_re = VALID_TAG_REGEX.get_or_init(|| {
        Regex::new(r"<!--[\s\S]*?-->|<(\/)?([A-Za-z][-.:0-9_A-Za-z@\p{L}\p{M}]*)([^>]*)>")
            .expect("the tag pattern is a constant, well-formed regex")
    });

    // ASCII-lowercased copy of `data`, built only if a raw-text element is
    // encountered. ASCII lowercasing keeps every byte offset unchanged, so
    // positions found in the copy are valid positions in `data`.
    let mut data_lc: Option<String> = None;
    let mut stack: Vec<String> = vec!["#root".to_owned()];
    let mut pos = 0usize;

    while let Some(caps) = tag_re.captures_at(&data, pos) {
        let whole = caps
            .get(0)
            .expect("capture group 0 is the whole match and always present");
        pos = whole.end();
        if whole.as_str().starts_with("<!--") {
            continue;
        }

        let is_closing = caps.get(1).is_some();
        let tag_name_raw = caps.get(2).map_or("", |m| m.as_str());
        let attr_part = caps.get(3).map_or("", |m| m.as_str());
        let tag_name_lc = tag_name_raw.to_ascii_lowercase();
        let tag_name: &str = if opts.lower_case_tag_name {
            tag_name_lc.as_str()
        } else {
            tag_name_raw
        };

        if tag_name_lc == FRAMEFLAG {
            continue;
        }
        // An unterminated opening tag at the very end of the input swallows
        // the synthetic closing wrapper into its attribute text; skip it.
        if !is_closing && attr_part.contains("</documentfragmentcontainer") {
            continue;
        }

        if is_closing {
            let target_lc = tag_name_lc.as_str();
            while let Some(top) = stack.last() {
                if top.as_str() == target_lc || top.as_str() == tag_name {
                    stack.pop();
                    break;
                }
                if closed_by_close(top.as_str()).contains(&target_lc) {
                    // Implicitly closed by this closing tag; keep unwinding.
                    stack.pop();
                    continue;
                }
                // Stray closing tag: ignore it.
                break;
            }
            continue;
        }

        // Opening tag: it may implicitly close the element currently on top.
        if let Some(parent) = stack.last() {
            if closed_by_open(parent.as_str()).contains(&tag_name) {
                stack.pop();
            }
        }

        let self_close = attr_part.trim_end().ends_with('/') || void_tag.is_void(&tag_name_lc);
        if self_close {
            continue;
        }

        if BLOCK_TEXT.contains(&tag_name_lc.as_str()) {
            // Raw-text element: jump past its closing tag without scanning
            // the content. The element never needs to live on the stack.
            let close_pat = format!("</{}>", tag_name_lc);
            let haystack = data_lc.get_or_insert_with(|| data.to_ascii_lowercase());
            match haystack[pos..].find(&close_pat) {
                Some(rel_idx) => pos += rel_idx + close_pat.len(),
                // No closing tag: the element can never be closed.
                None => return false,
            }
        } else {
            stack.push(tag_name.to_owned());
        }
    }

    stack.len() == 1
}