Skip to main content

node_html_parser/parser/
validator.rs

1//! HTML validation engine for checking well-formed markup.
2
3use crate::dom::void_tag::VoidTag;
4use regex::Regex;
5use std::sync::OnceLock;
6
7use super::types::Options;
8
/// Process-wide cache for the tag-scanning regex used by `valid`.
///
/// It starts empty and is filled on the first `valid` call via `get_or_init`, with a
/// pattern that matches either an HTML comment or a single opening/closing tag.
static VALID_TAG_REGEX: OnceLock<Regex> = OnceLock::new();
10
11/// 验证HTML是否有效
12///
13/// 解析HTML并检查栈长度是否为1,类似于js/node-html-parser中的valid函数
14///
15/// # 参数
16///
17/// * `input` - 要解析的HTML字符串
18/// * `opts` - 解析选项
19///
20/// # 返回值
21///
22/// 如果HTML有效(即栈长度为1)返回true,否则返回false
23pub fn valid(input: &str, opts: &Options) -> bool {
24	// 依据 JS 版本:仅看解析后栈是否仅包含 root
25	// 这里实现一个轻量 base_parse,与上方 parse 主逻辑分离,尽量复刻 js/node-html-parser/nodes/html.ts 中 base_parse 的栈规则
26
27	// void & frameflag 包装
28	const FRAMEFLAG: &str = "documentfragmentcontainer";
29	let data = format!("<{}>{}</{}>", FRAMEFLAG, input, FRAMEFLAG);
30	// data_end_virtual 未使用,移除以消除编译警告
31
32	let void_tag = VoidTag::new(&opts.void_tag);
33	// 与 parse_with_options 同步的(精简)Unicode tag name 支持:
34	let tag_re = VALID_TAG_REGEX.get_or_init(|| {
35		Regex::new(r"<!--[\s\S]*?-->|<(\/)?([A-Za-z][-.:0-9_A-Za-z@\p{L}\p{M}]*)([^>]*)>").unwrap()
36	});
37
38	// 对齐 JS 关闭规则映射
39	use std::collections::HashMap; // 局部使用,避免与上面 parse 冲突
40	let mut closed_by_open: HashMap<&'static str, HashMap<&'static str, bool>> = HashMap::new();
41	macro_rules! c_open {($p:literal, [$($c:literal),*]) => {{ let mut m=HashMap::new(); $(m.insert($c,true);)* closed_by_open.insert($p,m.clone()); closed_by_open.insert($p.to_uppercase().leak(),m);}}}
42	c_open!("li", ["li", "LI"]);
43	c_open!("p", ["p", "P", "div", "DIV"]);
44	c_open!("b", ["div", "DIV"]);
45	c_open!("td", ["td", "th", "TD", "TH"]);
46	c_open!("th", ["td", "th", "TD", "TH"]);
47	c_open!("h1", ["h1", "H1"]);
48	c_open!("h2", ["h2", "H2"]);
49	c_open!("h3", ["h3", "H3"]);
50	c_open!("h4", ["h4", "H4"]);
51	c_open!("h5", ["h5", "H5"]);
52	c_open!("h6", ["h6", "H6"]);
53
54	let mut closed_by_close: HashMap<&'static str, HashMap<&'static str, bool>> = HashMap::new();
55	macro_rules! c_close {($p:literal, [$($c:literal),*]) => {{ let mut m=HashMap::new(); $(m.insert($c,true);)* closed_by_close.insert($p,m.clone()); closed_by_close.insert($p.to_uppercase().leak(),m);}}}
56	c_close!("li", ["ul", "ol", "UL", "OL"]);
57	c_close!("a", ["div", "DIV"]);
58	c_close!("b", ["div", "DIV"]);
59	c_close!("i", ["div", "DIV"]);
60	c_close!("p", ["div", "DIV"]);
61	c_close!("td", ["tr", "table", "TR", "TABLE"]);
62	c_close!("th", ["tr", "table", "TR", "TABLE"]);
63
64	#[derive(Clone)]
65	struct SimpleEl {
66		raw: String,
67	}
68	let mut stack: Vec<SimpleEl> = vec![SimpleEl {
69		raw: "#root".into(),
70	}];
71	// block text elements: 其内部视为原始文本,不再解析标签(与 JS 一致)
72	let mut pos = 0usize;
73	let block_text: std::collections::HashSet<&'static str> =
74		["script", "style", "pre", "noscript"].into_iter().collect();
75	while let Some(m) = tag_re.find_at(&data, pos) {
76		let full = m.as_str();
77		pos = m.end();
78		if full.starts_with("<!--") {
79			continue;
80		}
81		let caps = tag_re.captures(full).unwrap();
82		let leading_slash = caps.get(1).map(|c| c.as_str()).unwrap_or("");
83		let tag_name_raw = caps.get(2).map(|c| c.as_str()).unwrap_or("");
84		let mut tag_name = tag_name_raw.to_string();
85		let tag_name_lc = tag_name_raw.to_ascii_lowercase();
86		if opts.lower_case_tag_name {
87			tag_name = tag_name_lc.clone();
88		}
89		if tag_name_lc == FRAMEFLAG {
90			continue;
91		}
92		let attr_part = caps.get(3).map(|c| c.as_str()).unwrap_or("");
93		// 兼容 issue_227 与 frameflag 包装:若错误地把 frameflag 的关闭标签吞进属性(出现 '</documentfragmentcontainer'),说明这是跨越边界的伪匹配,跳过
94		if leading_slash.is_empty() && attr_part.contains("</documentfragmentcontainer") {
95			continue;
96		}
97		let self_close = attr_part.trim_end().ends_with('/') || void_tag.is_void(&tag_name_lc);
98		if leading_slash.is_empty() {
99			// opening tag
100			if let Some(parent) = stack.last() {
101				if let Some(map) = closed_by_open.get(parent.raw.as_str()) {
102					if map.contains_key(tag_name.as_str()) {
103						stack.pop();
104					}
105				}
106			}
107			if !self_close && !void_tag.is_void(&tag_name_lc) {
108				let is_block = block_text.contains(tag_name_lc.as_str());
109				stack.push(SimpleEl {
110					raw: if opts.lower_case_tag_name {
111						tag_name_lc.clone()
112					} else {
113						tag_name.clone()
114					},
115				});
116				if is_block {
117					// 查找对应关闭标签(大小写不敏感)
118					let close_pat = format!("</{}>", tag_name_lc);
119					// 为避免多次分配,做一次 to_lowercase() 子串搜索
120					if let Some(rel_idx) = data[pos..].to_ascii_lowercase().find(&close_pat) {
121						let close_start = pos + rel_idx; // '<' of closing
122									   // 找到 '>'
123						if let Some(gt_rel) = data[close_start..].find('>') {
124							// pop block 元素(模拟遇到关闭标签)
125							stack.pop();
126							pos = close_start + gt_rel + 1; // 跳过整个关闭标签
127							continue; // 继续下一轮 find_at
128						} else {
129							// 没有 '>',视为未闭合,结束
130							break;
131						}
132					} else {
133						// 未找到关闭,视为未闭合 => 结束
134						break;
135					}
136				}
137			}
138		} else {
139			let target_lc = tag_name_lc.as_str();
140			// closing tag
141			loop {
142				if let Some(top) = stack.last() {
143					if top.raw.eq(target_lc) || top.raw.eq(tag_name.as_str()) {
144						stack.pop();
145						break;
146					}
147					if let Some(map) = closed_by_close.get(top.raw.as_str()) {
148						if map.contains_key(tag_name.as_str()) || map.contains_key(target_lc) {
149							stack.pop();
150							continue;
151						}
152					}
153				}
154				break;
155			}
156		}
157	}
158	// JS: stack length==1 表明完整闭合
159	let ok = stack.len() == 1;
160	ok
161}