node_html_parser/parser/
validator.rs1use crate::dom::void_tag::VoidTag;
4use regex::Regex;
5use std::sync::OnceLock;
6
7use super::types::Options;
8
/// Process-wide cache for the comment/tag regex used by `valid`; the pattern is
/// compiled on first use through `OnceLock::get_or_init` and reused afterwards.
static VALID_TAG_REGEX: OnceLock<Regex> = OnceLock::new();
10
11pub fn valid(input: &str, opts: &Options) -> bool {
24 const FRAMEFLAG: &str = "documentfragmentcontainer";
29 let data = format!("<{}>{}</{}>", FRAMEFLAG, input, FRAMEFLAG);
30 let void_tag = VoidTag::new(&opts.void_tag);
33 let tag_re = VALID_TAG_REGEX.get_or_init(|| {
35 Regex::new(r"<!--[\s\S]*?-->|<(\/)?([A-Za-z][-.:0-9_A-Za-z@\p{L}\p{M}]*)([^>]*)>").unwrap()
36 });
37
38 use std::collections::HashMap; let mut closed_by_open: HashMap<&'static str, HashMap<&'static str, bool>> = HashMap::new();
41 macro_rules! c_open {($p:literal, [$($c:literal),*]) => {{ let mut m=HashMap::new(); $(m.insert($c,true);)* closed_by_open.insert($p,m.clone()); closed_by_open.insert($p.to_uppercase().leak(),m);}}}
42 c_open!("li", ["li", "LI"]);
43 c_open!("p", ["p", "P", "div", "DIV"]);
44 c_open!("b", ["div", "DIV"]);
45 c_open!("td", ["td", "th", "TD", "TH"]);
46 c_open!("th", ["td", "th", "TD", "TH"]);
47 c_open!("h1", ["h1", "H1"]);
48 c_open!("h2", ["h2", "H2"]);
49 c_open!("h3", ["h3", "H3"]);
50 c_open!("h4", ["h4", "H4"]);
51 c_open!("h5", ["h5", "H5"]);
52 c_open!("h6", ["h6", "H6"]);
53
54 let mut closed_by_close: HashMap<&'static str, HashMap<&'static str, bool>> = HashMap::new();
55 macro_rules! c_close {($p:literal, [$($c:literal),*]) => {{ let mut m=HashMap::new(); $(m.insert($c,true);)* closed_by_close.insert($p,m.clone()); closed_by_close.insert($p.to_uppercase().leak(),m);}}}
56 c_close!("li", ["ul", "ol", "UL", "OL"]);
57 c_close!("a", ["div", "DIV"]);
58 c_close!("b", ["div", "DIV"]);
59 c_close!("i", ["div", "DIV"]);
60 c_close!("p", ["div", "DIV"]);
61 c_close!("td", ["tr", "table", "TR", "TABLE"]);
62 c_close!("th", ["tr", "table", "TR", "TABLE"]);
63
64 #[derive(Clone)]
65 struct SimpleEl {
66 raw: String,
67 }
68 let mut stack: Vec<SimpleEl> = vec![SimpleEl {
69 raw: "#root".into(),
70 }];
71 let mut pos = 0usize;
73 let block_text: std::collections::HashSet<&'static str> =
74 ["script", "style", "pre", "noscript"].into_iter().collect();
75 while let Some(m) = tag_re.find_at(&data, pos) {
76 let full = m.as_str();
77 pos = m.end();
78 if full.starts_with("<!--") {
79 continue;
80 }
81 let caps = tag_re.captures(full).unwrap();
82 let leading_slash = caps.get(1).map(|c| c.as_str()).unwrap_or("");
83 let tag_name_raw = caps.get(2).map(|c| c.as_str()).unwrap_or("");
84 let mut tag_name = tag_name_raw.to_string();
85 let tag_name_lc = tag_name_raw.to_ascii_lowercase();
86 if opts.lower_case_tag_name {
87 tag_name = tag_name_lc.clone();
88 }
89 if tag_name_lc == FRAMEFLAG {
90 continue;
91 }
92 let attr_part = caps.get(3).map(|c| c.as_str()).unwrap_or("");
93 if leading_slash.is_empty() && attr_part.contains("</documentfragmentcontainer") {
95 continue;
96 }
97 let self_close = attr_part.trim_end().ends_with('/') || void_tag.is_void(&tag_name_lc);
98 if leading_slash.is_empty() {
99 if let Some(parent) = stack.last() {
101 if let Some(map) = closed_by_open.get(parent.raw.as_str()) {
102 if map.contains_key(tag_name.as_str()) {
103 stack.pop();
104 }
105 }
106 }
107 if !self_close && !void_tag.is_void(&tag_name_lc) {
108 let is_block = block_text.contains(tag_name_lc.as_str());
109 stack.push(SimpleEl {
110 raw: if opts.lower_case_tag_name {
111 tag_name_lc.clone()
112 } else {
113 tag_name.clone()
114 },
115 });
116 if is_block {
117 let close_pat = format!("</{}>", tag_name_lc);
119 if let Some(rel_idx) = data[pos..].to_ascii_lowercase().find(&close_pat) {
121 let close_start = pos + rel_idx; if let Some(gt_rel) = data[close_start..].find('>') {
124 stack.pop();
126 pos = close_start + gt_rel + 1; continue; } else {
129 break;
131 }
132 } else {
133 break;
135 }
136 }
137 }
138 } else {
139 let target_lc = tag_name_lc.as_str();
140 loop {
142 if let Some(top) = stack.last() {
143 if top.raw.eq(target_lc) || top.raw.eq(tag_name.as_str()) {
144 stack.pop();
145 break;
146 }
147 if let Some(map) = closed_by_close.get(top.raw.as_str()) {
148 if map.contains_key(tag_name.as_str()) || map.contains_key(target_lc) {
149 stack.pop();
150 continue;
151 }
152 }
153 }
154 break;
155 }
156 }
157 }
158 let ok = stack.len() == 1;
160 ok
161}