node_html_parser/parser/
mod.rs

1//! Minimal Rust translation of core logic from js/node-html-parser.
2//! Provides a `parse` function returning the root Element whose
3//! children correspond to the original HTML fragment.
4use crate::dom::comment::CommentNode;
5use crate::dom::element::HTMLElement;
6use crate::dom::node::Node;
7use crate::dom::text::TextNode;
8use crate::dom::void_tag::{VoidTag, VoidTagOptions};
9use regex::Regex;
10use std::collections::HashMap;
11
/// Parser configuration consumed by `parse_with_options` and `valid`.
#[derive(Debug, Clone)]
pub struct Options {
	/// When true, every tag name is lower-cased before the element is created.
	pub lower_case_tag_name: bool,
	/// When true, `<!-- ... -->` comments are kept as comment nodes; otherwise they are dropped.
	pub comment: bool,
	/// Corresponds to the JS option `fixNestedATags`: when true, opening an `a`/`A` tag while a
	/// previous one is still open first closes everything back to that previous anchor.
	pub fix_nested_a_tags: bool,
	/// Corresponds to the JS option `parseNoneClosedTags`: when true, elements still open at the
	/// end of the input are attached to their parent as-is and the repair pass is skipped.
	pub parse_none_closed_tags: bool,
	/// Block-text elements, keyed by lower-case tag name. The content of a listed element is not
	/// parsed as markup. `true` keeps the raw inner text as a single text node; `false` discards it.
	pub block_text_elements: HashMap<String, bool>,
	/// When true, even if block_text_elements requests extraction for script/style, we suppress
	/// creating the inner raw Text node (used by tests expecting empty script/style by default).
	pub suppress_script_style_text: bool,
	/// Void-tag settings; used to build the `VoidTag` matcher, and its `add_closing_slash` flag is
	/// forwarded to every element the parser creates.
	pub void_tag: VoidTagOptions,
}
26
27impl Default for Options {
28	fn default() -> Self {
29		let mut block = HashMap::new();
30		// 默认:script/style/noscript/pre 都作为 block 文本元素,捕获其原始文本(不解析内部标签)
31		block.insert("script".into(), true);
32		block.insert("style".into(), true);
33		block.insert("noscript".into(), true);
34		block.insert("pre".into(), true);
35		Self {
36			lower_case_tag_name: false,
37			comment: false,
38			fix_nested_a_tags: false,
39			parse_none_closed_tags: false,
40			block_text_elements: block,
41			suppress_script_style_text: false,
42			void_tag: Default::default(),
43		}
44	}
45}
46
47pub fn parse(input: &str) -> Box<HTMLElement> {
48	parse_with_options(input, &Options::default())
49}
50
51pub fn parse_with_options(input: &str, opts: &Options) -> Box<HTMLElement> {
52	// 改进:
53	// 1) 支持引号内含 '>'(通过 attrs 子模式保证只有引号外的 '>' 终止标签)
54	// 2) 扩展 tagName 的 Unicode 范围以对齐 JS 版本 kMarkupPattern,使自定义标签(含中文、阿拉伯等 BMP 范围字符)通过。
55	//    JS 的模式包含大量 Unicode 范围及高位平面;Rust regex 不支持 >BMP 直接类,这里截取到 BMP(满足当前测试需求)。
56	let tag_re = Regex::new(r#"<!--(?s:.*?)-->|<(\/)?([A-Za-z][A-Za-z0-9._:@\-\p{L}\p{M}]*)(?P<attrs>(?:[^>"']|"[^"]*"|'[^']*')*)(/?)>"#).unwrap();
57	let attr_key_re = Regex::new(
58		r#"([a-zA-Z()\[\]#@$.?:][a-zA-Z0-9-._:()\[\]#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|[^\s>]+))?"#,
59	)
60	.unwrap();
61	let void_tag = VoidTag::new(&opts.void_tag);
62
63	// Elements closed by opening of specific following tags (subset of JS map)
64	let mut closed_by_open: HashMap<&'static str, Vec<&'static str>> = HashMap::new();
65	// helper: insert mapping for both lower & upper parent names (mirror JS maps having LI & li etc.)
66	macro_rules! ins_open {
67		($parent:expr, [ $($v:expr),* ]) => {{
68			closed_by_open.insert($parent, vec![$($v),*]);
69			let up = $parent.to_uppercase();
70			if up != $parent { closed_by_open.insert(Box::leak(up.into_boxed_str()), vec![$($v),*]); }
71		}};
72	}
73	ins_open!("li", ["li", "LI"]);
74	ins_open!("p", ["p", "P", "div", "DIV"]);
75	ins_open!("b", ["div", "DIV"]);
76	ins_open!("td", ["td", "th", "TD", "TH"]);
77	ins_open!("th", ["td", "th", "TD", "TH"]);
78	ins_open!("h1", ["h1", "H1"]);
79	ins_open!("h2", ["h2", "H2"]);
80	ins_open!("h3", ["h3", "H3"]);
81	ins_open!("h4", ["h4", "H4"]);
82	ins_open!("h5", ["h5", "H5"]);
83	ins_open!("h6", ["h6", "H6"]);
84
85	let mut closed_by_close: HashMap<&'static str, Vec<&'static str>> = HashMap::new();
86	macro_rules! ins_close {
87		($parent:expr, [ $($v:expr),* ]) => {{
88			closed_by_close.insert($parent, vec![$($v),*]);
89			let up = $parent.to_uppercase();
90			if up != $parent { closed_by_close.insert(Box::leak(up.into_boxed_str()), vec![$($v),*]); }
91		}};
92	}
93	ins_close!("li", ["ul", "UL", "ol", "OL"]);
94	ins_close!("a", ["div", "DIV"]);
95	ins_close!("b", ["div", "DIV"]);
96	ins_close!("i", ["div", "DIV"]);
97	ins_close!("p", ["div", "DIV"]);
98	ins_close!("td", ["tr", "TR", "table", "TABLE"]);
99	ins_close!("th", ["tr", "TR", "table", "TABLE"]);
100
101	#[derive(Clone)]
102	struct StackEntry {
103		elem: Box<HTMLElement>,
104	}
105	let root = Box::new(HTMLElement::new(
106		None,
107		String::new(),
108		Vec::new(),
109		false,
110		opts.void_tag.add_closing_slash,
111	));
112	let mut stack: Vec<StackEntry> = vec![StackEntry { elem: root }];
113	// Safe mutable access to set options flags on root
114	if let Some(first) = stack.last_mut() {
115		first.elem.parse_comment = opts.comment;
116		first.elem.parse_lowercase = opts.lower_case_tag_name;
117	}
118	let mut last_text_pos = 0usize;
119	let mut no_nested_a_index: Option<usize> = None;
120	// 记录在 block 文本处理阶段被直接消耗的关闭标签位置
121	let mut skipped_closing_starts: std::collections::HashSet<usize> =
122		std::collections::HashSet::new();
123
124	let frameflag = "documentfragmentcontainer";
125	let frame_prefix = format!("<{}>", frameflag);
126	let frame_suffix = format!("</{}>", frameflag);
127	let data = format!("{}{}{}", frame_prefix, input, frame_suffix);
128	let frame_offset = frame_prefix.len();
129	// 遍历标签匹配(正则已保证不会把引号内的 '>' 视作结束)
130	for m in tag_re.captures_iter(&data) {
131		let full = m.get(0).unwrap();
132		// 若该匹配为已消耗的 blockTextElements 的关闭标签,直接跳过(模拟 JS 在 block 读取中提前消耗关闭标签)
133		if skipped_closing_starts.contains(&full.start()) {
134			continue;
135		}
136		let leading_slash = m.get(1).map(|c| c.as_str()).unwrap_or("");
137		let mut tag_name = m.get(2).map(|c| c.as_str()).unwrap_or("").to_string();
138		let is_comment = full.as_str().starts_with("<!--");
139		let raw_attr_part = m.name("attrs").map(|c| c.as_str()).unwrap_or("");
140		let closing_slash_cap = m.get(4).map(|c| c.as_str()).unwrap_or("");
141		// 为了贴合 JS 行为:kMarkupPattern 中 attributes 与末尾的可选 "/" 分离;
142		// 我们的正则会把末尾的自闭合斜杠吞进 attrs(因为 / 不被排除)。
143		// 修正策略:从右向左扫描,若发现未在引号内的末尾 '/',则把它视作自闭合标记并剥离。
144		fn strip_trailing_self_close(s: &str) -> (String, bool) {
145			let mut in_single = false;
146			let mut in_double = false;
147			// 反向扫描需要掌握每个字符的引号状态;简单做法:正向一次记录状态,然后再反向索引。
148			let chars: Vec<char> = s.chars().collect();
149			let mut quote_state: Vec<(bool, bool)> = Vec::with_capacity(chars.len());
150			for &ch in &chars {
151				match ch {
152					'"' if !in_single => in_double = !in_double,
153					'\'' if !in_double => in_single = !in_single,
154					_ => {}
155				}
156				quote_state.push((in_single, in_double));
157			}
158			// 反向跳过空白
159			let mut idx = chars.len();
160			while idx > 0 && chars[idx - 1].is_whitespace() {
161				idx -= 1;
162			}
163			if idx > 0 && chars[idx - 1] == '/' {
164				let (s_in, d_in) = quote_state[idx - 1];
165				if !s_in && !d_in {
166					// 确认不在引号内
167					// 再检查前一个非空白字符是否是 '='(防止把值里的孤立 / 当成标记,但 alt="Path/"/> 情况最后一个 / 属于值内部,后面还有 '"' 再有 '/' self-close)
168					// 在 alt="Path/"/> 中 attr_part 末尾会是 alt="Path/"/ :倒数第二个字符是 '"'
169					// 此时 idx-2 是 '"', 再往前是 'h'. 可以安全剥离。
170					let cleaned = chars[..idx - 1].iter().collect();
171					return (cleaned, true);
172				}
173			}
174			(s.to_string(), false)
175		}
176		let (attr_part, trailing_self_close) = strip_trailing_self_close(raw_attr_part);
177		let match_start = full.start();
178		if match_start > last_text_pos {
179			let text = &data[last_text_pos..match_start];
180			if !text.is_empty() {
181				let top = stack.last_mut().unwrap();
182				let start_src = last_text_pos.saturating_sub(frame_offset);
183				let end_src = match_start.saturating_sub(frame_offset);
184				top.elem.children.push(Node::Text(TextNode::with_range(
185					text.to_string(),
186					start_src,
187					end_src,
188				)));
189			}
190		}
191		last_text_pos = full.end();
192		if is_comment {
193			if opts.comment {
194				let inner = full
195					.as_str()
196					.trim_start_matches("<!--")
197					.trim_end_matches("-->");
198				let top = stack.last_mut().unwrap();
199				// 记录注释在原输入中的范围(包含 <!-- --> 符号),与元素/文本一样使用 (start,end) 半开区间
200				let start_src = match_start.saturating_sub(frame_offset);
201				let end_src = full.end().saturating_sub(frame_offset);
202				top.elem
203					.children
204					.push(Node::Comment(CommentNode::with_range(
205						inner.to_string(),
206						start_src,
207						end_src,
208					)));
209			}
210			continue;
211		}
212		if tag_name == frameflag {
213			continue;
214		}
215		if opts.lower_case_tag_name {
216			tag_name = tag_name.to_lowercase();
217		}
218		let mut self_closing = !closing_slash_cap.is_empty()
219			|| trailing_self_close
220			|| attr_part.trim_end().ends_with('/') // 保险:兼容旧逻辑(理论上已剥离)
221			|| void_tag.is_void(&tag_name);
222
223		if leading_slash.is_empty() {
224			// opening tag
225			// auto close logic (original behavior: just pop parent when needed)
226			if let Some(parent) = stack.last() {
227				if let Some(list) = closed_by_open.get(parent.elem.name()) {
228					if list
229						.iter()
230						.any(|t| **t == tag_name || *t == tag_name.to_uppercase())
231					{
232						if stack.len() > 1 {
233							let closed = stack.pop().unwrap();
234							stack
235								.last_mut()
236								.unwrap()
237								.elem
238								.children
239								.push(Node::Element(closed.elem));
240						}
241					}
242				}
243			}
244			// fix nested A tags
245			if opts.fix_nested_a_tags && (tag_name.eq("a") || tag_name.eq("A")) {
246				if let Some(idx) = no_nested_a_index {
247					while stack.len() > idx {
248						let closed = stack.pop().unwrap();
249						stack
250							.last_mut()
251							.unwrap()
252							.elem
253							.children
254							.push(Node::Element(closed.elem));
255					}
256				}
257				no_nested_a_index = Some(stack.len());
258			}
259
260			// 初始阶段仅收集 id 和 class,其余延迟解析(更贴近 JS base_parse 行为)
261			let mut attrs: Vec<(String, String)> = Vec::new();
262			let mut saw_other_attr = false;
263			for cap in attr_key_re.captures_iter(&attr_part) {
264				let k = cap.get(1).unwrap().as_str();
265				let v_raw_opt = cap.get(2).or(cap.get(3)).or(cap.get(4));
266				let raw_v = v_raw_opt.map(|v| v.as_str()).unwrap_or("");
267				let unquoted = if raw_v.starts_with('"') || raw_v.starts_with('\'') {
268					raw_v.trim_matches(['"', '\''])
269				} else {
270					raw_v
271				};
272				let lk = k.to_lowercase();
273				if lk == "id" || lk == "class" {
274					attrs.push((lk, html_escape::decode_html_entities(unquoted).to_string()));
275				} else {
276					saw_other_attr = true;
277				}
278			}
279			let raw_attr_string = if attr_part.starts_with(' ') {
280				attr_part[1..].to_string()
281			} else {
282				attr_part.to_string()
283			};
284			let mut elem = Box::new(HTMLElement::new(
285				Some(tag_name.clone()),
286				raw_attr_string,
287				attrs,
288				self_closing && void_tag.is_void(&tag_name),
289				opts.void_tag.add_closing_slash,
290			));
291			// range 将在真正闭合或自闭合时由 set_range_start / set_range_end 赋值
292			// 记录解析选项供后续 set_content 继承
293			elem.parse_comment = opts.comment;
294			elem.parse_lowercase = opts.lower_case_tag_name;
295			if saw_other_attr {
296				elem.attrs_complete = false;
297			}
298			let open_start = match_start.saturating_sub(frame_offset);
299			let open_end = full.end().saturating_sub(frame_offset);
300			// provisional start
301			elem.set_range_start(open_start);
302			if self_closing {
303				// for self-closing tag finalize range
304				elem.set_range_end(open_end);
305			}
306			// 解析阶段填充 id 字段,便于后续 structure / closest 等直接使用(与 JS keyAttrs.id 行为对齐)
307			if let Some((_, v)) = elem.attrs.iter().find(|(k, _)| k == "id") {
308				elem.id = v.clone();
309			}
310
311			// block text element handling: capture inner text verbatim until closing tag
312			let lower_tag = tag_name.to_lowercase();
313			if let Some(extract) = opts.block_text_elements.get(&lower_tag) {
314				// 查找对应关闭标签(大小写不敏感,参考 JS 逻辑)
315				let close_markup = format!("</{}>", tag_name);
316				let search_slice = &data[last_text_pos..];
317				let lower_search = search_slice.to_lowercase();
318				let lower_close = close_markup.to_lowercase();
319				if let Some(rel) = lower_search.find(&lower_close) {
320					let close_start = last_text_pos + rel; // 关闭标签 '<' 起始位置
321					let suppress = opts.suppress_script_style_text
322						&& (lower_tag == "script" || lower_tag == "style");
323					if *extract && !suppress {
324						// true -> 提取文本(除非被全局抑制)
325						let inner_text = &data[last_text_pos..close_start];
326						if !inner_text.is_empty() {
327							let inner_start = last_text_pos.saturating_sub(frame_offset);
328							let inner_end = close_start.saturating_sub(frame_offset);
329							elem.children.push(Node::Text(TextNode::with_range(
330								inner_text.to_string(),
331								inner_start,
332								inner_end,
333							)));
334						}
335					}
336					// 无论是否提取文本,都跳过关闭标签匹配:记录其起始位置,供主循环跳过
337					skipped_closing_starts.insert(close_start);
338					last_text_pos = close_start + close_markup.len();
339					let close_end_src =
340						(close_start + close_markup.len()).saturating_sub(frame_offset);
341					elem.set_range_end(close_end_src);
342					self_closing = true; // 强制立即闭合
343				} else {
344					// 未找到关闭标签:与 JS 一致,标记 last_text_pos 到末尾避免后续文本节点重复
345					last_text_pos = data.len() + 1;
346				}
347			}
348			// 若因 block_textElements 强制 self_closing,将 elem.children 保持现状并不入栈
349
350			if self_closing {
351				let parent = stack.last_mut().unwrap();
352				let parent_ptr: *mut HTMLElement = &mut *parent.elem;
353				elem.parent = Some(parent_ptr);
354				parent.elem.children.push(Node::Element(elem));
355			} else {
356				let parent_ptr: *mut HTMLElement = &mut *stack.last_mut().unwrap().elem;
357				elem.parent = Some(parent_ptr);
358				stack.push(StackEntry { elem });
359			}
360		} else {
361			// closing tag
362			// remove nested a index if closing A
363			if no_nested_a_index.is_some() && (tag_name.eq("a") || tag_name.eq("A")) {
364				no_nested_a_index = None;
365			}
366			let target = tag_name.to_lowercase();
367			// try to find matching open
368			let mut i = stack.len();
369			while i > 1 {
370				// skip root at 0
371				i -= 1;
372				if stack[i].elem.name().eq_ignore_ascii_case(&target) {
373					while stack.len() > i + 1 {
374						let closed = stack.pop().unwrap();
375						let parent = stack.last_mut().unwrap();
376						let mut e = closed.elem;
377						let parent_ptr: *mut HTMLElement = &mut *parent.elem;
378						e.parent = Some(parent_ptr);
379						let close_end = full.end().saturating_sub(frame_offset);
380						if e.range().is_some() {
381							e.set_range_end(close_end);
382						}
383						parent.elem.children.push(Node::Element(e));
384					}
385					let closed = stack.pop().unwrap();
386					let parent = stack.last_mut().unwrap();
387					let mut e = closed.elem;
388					let parent_ptr: *mut HTMLElement = &mut *parent.elem;
389					e.parent = Some(parent_ptr);
390					let close_end_main = full.end().saturating_sub(frame_offset);
391					if e.range().is_some() {
392						e.set_range_end(close_end_main);
393					}
394					parent.elem.children.push(Node::Element(e));
395					break;
396				} else {
397					// aggressive strategy: if parent would be auto-closed by this closing tag
398					let parent_name = stack[i].elem.name().to_lowercase();
399					if let Some(list) = closed_by_close.get(parent_name.as_str()) {
400						if list.iter().any(|x| x.eq_ignore_ascii_case(&tag_name)) {
401							let closed = stack.pop().unwrap();
402							let parent = stack.last_mut().unwrap();
403							let mut e = closed.elem;
404							let parent_ptr: *mut HTMLElement = &mut *parent.elem;
405							e.parent = Some(parent_ptr);
406							parent.elem.children.push(Node::Element(e));
407							continue;
408						}
409					}
410				}
411			}
412		}
413	}
414	// trailing text if any
415	if last_text_pos < data.len() {
416		let text = &data[last_text_pos..];
417		if !text.is_empty() {
418			let top = stack.last_mut().unwrap();
419			let start_src = last_text_pos.saturating_sub(frame_offset);
420			let end_src = data.len().saturating_sub(frame_offset);
421			top.elem.children.push(Node::Text(TextNode::with_range(
422				text.to_string(),
423				start_src,
424				end_src,
425			)));
426		}
427	}
428	// unwind unless parse_none_closed_tags keeps them as-is
429	if opts.parse_none_closed_tags {
430		// 保持原样直接线性挂接
431		while stack.len() > 1 {
432			let closed = stack.pop().unwrap();
433			let parent = stack.last_mut().unwrap();
434			let mut e = closed.elem;
435			let parent_ptr: *mut HTMLElement = &mut *parent.elem;
436			e.parent = Some(parent_ptr);
437			parent.elem.children.push(Node::Element(e));
438		}
439		let root = stack.pop().unwrap().elem;
440		return root;
441	}
442	// JS parse() 错误修复阶段:处理 pair error 与 single error
443	// stack[0] 为 root, 其余为未闭合链。模拟 JS: while stack.length > 1 { let last = pop(); let oneBefore = back(); ... }
444	while stack.len() > 1 {
445		let last = stack.pop().unwrap();
446		let one_before = stack.last_mut().unwrap();
447		let mut last_elem = last.elem; // detached error element
448								 // 简化 pair error 逻辑:若相邻未闭合标签标签名相同,视为 pair(与 JS 修复阶段行为保持)
449		let is_pair = last_elem.name() == one_before.elem.name();
450		if is_pair {
451			// pair error 修复:移除 last,将其子节点上提到 oneBefore 的父级
452			// 这里简化:直接附加到 one_before.elem 的父(若存在)
453			// pair 修复:移除 one_before 的重复子元素,将 last 的子节点上移到 one_before 的父级
454			if let Some(one_parent_ptr) = one_before.elem.parent {
455				unsafe {
456					let one_parent = &mut *one_parent_ptr;
457					one_parent.remove_children_where(
458						|n| matches!(n, Node::Element(e) if e.name()==last_elem.name()),
459					);
460					for mut child in last_elem.children.drain(..) {
461						if let Node::Element(e) = &mut child {
462							let parent_ptr: *mut HTMLElement = one_parent_ptr;
463							e.parent = Some(parent_ptr);
464						}
465						one_parent.children.push(child);
466					}
467				}
468			}
469			continue;
470		}
471		// single error: remove last but keep its children inside one_before
472		// 但若这是文件尾部未闭合的正常标签(比如 <div><p>abc),应保留 last_elem 并设置其 range 结束到末尾
473		let end_fix_needed =
474			last_elem.range().is_some() && last_elem.range().unwrap().1 < input.len();
475		if end_fix_needed {
476			// finalize range end to input end
477			last_elem.set_range_end(input.len());
478			// 挂回 one_before
479			let parent_ptr: *mut HTMLElement = &mut *one_before.elem;
480			last_elem.parent = Some(parent_ptr);
481			one_before.elem.children.push(Node::Element(last_elem));
482			continue;
483		} else {
484			let target_name = last_elem.name().to_string();
485			one_before
486				.elem
487				.remove_children_where(|n| matches!(n, Node::Element(e) if e.name()==target_name));
488			for mut child in last_elem.children.drain(..) {
489				match &mut child {
490					Node::Element(e) => {
491						let parent_ptr: *mut HTMLElement = &mut *one_before.elem;
492						e.parent = Some(parent_ptr);
493					}
494					_ => {}
495				}
496				one_before.elem.children.push(child);
497			}
498		}
499	}
500	// 最终收束:stack 剩 root
501	let root = stack.pop().unwrap().elem;
502	// 后处理:相邻重复 heading(h1-h6)时,将后者子节点(非空内容)提升为前者之后的兄弟,并移除后者。
503	// 目的:复刻 JS parse() 在修复相邻或嵌套 heading 过程中产生的展平效果(见 tests 中 h3 链相关用例)。
504	fn promote_heading_duplicates(node: &mut HTMLElement) {
505		use crate::dom::node::Node;
506		let mut i = 0;
507		while i + 1 < node.children.len() {
508			let promote = match (&node.children[i], &node.children[i + 1]) {
509				(Node::Element(a), Node::Element(b)) => {
510					let n1 = a.name();
511					let n2 = b.name();
512					if n1 == n2 && matches!(n1, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
513						Some((n1.to_string(), i + 1))
514					} else {
515						None
516					}
517				}
518				_ => None,
519			};
520			// 如果当前是 heading,后面紧跟不同标签且当前 heading 没有结束 range,补一个空结束(使序列化产生 </h3>)
521			if promote.is_none() && i + 1 < node.children.len() {
522				// safe split to avoid aliasing mutable/immutable borrows
523				let (left, right) = node.children.split_at_mut(i + 1); // left has up to i
524				if let Some(Node::Element(h)) = left.last_mut() {
525					if let Some(Node::Element(next_el)) = right.first() {
526						let hn = h.name();
527						if matches!(hn, "h1" | "h2" | "h3" | "h4" | "h5" | "h6")
528							&& hn != next_el.name()
529						{
530							if let Some(r) = h.range() {
531								if r.0 == r.1 {
532									h.set_range_end(r.0);
533								}
534							}
535						}
536					}
537				}
538			}
539			if let Some((_name, dup_idx)) = promote {
540				// 取出重复 heading
541				let mut dup = match node.children.remove(dup_idx) {
542					Node::Element(e) => e,
543					_ => unreachable!(),
544				};
545				// 过滤并提升其子节点:丢弃空元素(无属性 & 无子节点)
546				let insertion_pos = i + 1; // 在原第一 heading 之后依次插入
547				let mut promoted: Vec<Node> = Vec::new();
548				for child in dup.children.drain(..) {
549					let keep = match &child {
550						Node::Element(e) => {
551							let name = e.name();
552							// 仅过滤“空 div” (无属性且无子节点);其它标签即使空也保留(如 span)以匹配 js 行为
553							if name == "div" && e.raw_attrs.is_empty() && e.children.is_empty() {
554								false
555							} else {
556								true
557							}
558						}
559						_ => true,
560					};
561					if keep {
562						promoted.push(child);
563					}
564				}
565				for (offset, mut ch) in promoted.into_iter().enumerate() {
566					if let Node::Element(ref mut e) = ch {
567						e.parent = Some(node as *mut HTMLElement);
568					}
569					node.children.insert(insertion_pos + offset, ch);
570				}
571				// 若第一个 heading 仍为空且 range 未闭合 (start==end),为使序列化输出闭合标签,模拟设置结束位置
572				if let Node::Element(first_h) = &mut node.children[i] {
573					if matches!(first_h.name(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
574						if let Some((s, e)) = first_h.range() {
575							if s == e {
576								first_h.set_range_end(s + 1);
577							}
578						}
579					}
580				}
581				// 不递增 i,重新检查当前位置后续可能还有重复
582				continue;
583			}
584			i += 1;
585		}
586		// 递归
587		for child in node.children.iter_mut() {
588			if let Node::Element(e) = child {
589				promote_heading_duplicates(e);
590			}
591		}
592	}
593	let mut root_box = root;
594	promote_heading_duplicates(&mut root_box);
595	root_box
596}
597
598/// 验证HTML是否有效
599///
600/// 解析HTML并检查栈长度是否为1,类似于js/node-html-parser中的valid函数
601///
602/// # 参数
603///
604/// * `input` - 要解析的HTML字符串
605/// * `opts` - 解析选项
606///
607/// # 返回值
608///
609/// 如果HTML有效(即栈长度为1)返回true,否则返回false
610pub fn valid(input: &str, opts: &Options) -> bool {
611	// 依据 JS 版本:仅看解析后栈是否仅包含 root
612	// 这里实现一个轻量 base_parse,与上方 parse 主逻辑分离,尽量复刻 js/node-html-parser/nodes/html.ts 中 base_parse 的栈规则
613
614	// void & frameflag 包装
615	const FRAMEFLAG: &str = "documentfragmentcontainer";
616	let data = format!("<{}>{}</{}>", FRAMEFLAG, input, FRAMEFLAG);
617	// data_end_virtual 未使用,移除以消除编译警告
618
619	let void_tag = VoidTag::new(&opts.void_tag);
620	// 与 parse_with_options 同步的(精简)Unicode tag name 支持:
621	let tag_re =
622		Regex::new(r"<!--[\s\S]*?-->|<(\/)?([A-Za-z][-.:0-9_A-Za-z@\p{L}\p{M}]*)([^>]*)>").unwrap();
623
624	// 对齐 JS 关闭规则映射
625	use std::collections::HashMap; // 局部使用,避免与上面 parse 冲突
626	let mut closed_by_open: HashMap<&'static str, HashMap<&'static str, bool>> = HashMap::new();
627	macro_rules! c_open {($p:literal, [$($c:literal),*]) => {{ let mut m=HashMap::new(); $(m.insert($c,true);)* closed_by_open.insert($p,m.clone()); closed_by_open.insert($p.to_uppercase().leak(),m);}}}
628	c_open!("li", ["li", "LI"]);
629	c_open!("p", ["p", "P", "div", "DIV"]);
630	c_open!("b", ["div", "DIV"]);
631	c_open!("td", ["td", "th", "TD", "TH"]);
632	c_open!("th", ["td", "th", "TD", "TH"]);
633	c_open!("h1", ["h1", "H1"]);
634	c_open!("h2", ["h2", "H2"]);
635	c_open!("h3", ["h3", "H3"]);
636	c_open!("h4", ["h4", "H4"]);
637	c_open!("h5", ["h5", "H5"]);
638	c_open!("h6", ["h6", "H6"]);
639
640	let mut closed_by_close: HashMap<&'static str, HashMap<&'static str, bool>> = HashMap::new();
641	macro_rules! c_close {($p:literal, [$($c:literal),*]) => {{ let mut m=HashMap::new(); $(m.insert($c,true);)* closed_by_close.insert($p,m.clone()); closed_by_close.insert($p.to_uppercase().leak(),m);}}}
642	c_close!("li", ["ul", "ol", "UL", "OL"]);
643	c_close!("a", ["div", "DIV"]);
644	c_close!("b", ["div", "DIV"]);
645	c_close!("i", ["div", "DIV"]);
646	c_close!("p", ["div", "DIV"]);
647	c_close!("td", ["tr", "table", "TR", "TABLE"]);
648	c_close!("th", ["tr", "table", "TR", "TABLE"]);
649
650	#[derive(Clone)]
651	struct SimpleEl {
652		raw: String,
653	}
654	let mut stack: Vec<SimpleEl> = vec![SimpleEl {
655		raw: "#root".into(),
656	}];
657	// block text elements: 其内部视为原始文本,不再解析标签(与 JS 一致)
658	let mut pos = 0usize;
659	let block_text: std::collections::HashSet<&'static str> =
660		["script", "style", "pre", "noscript"].into_iter().collect();
661	while let Some(m) = tag_re.find_at(&data, pos) {
662		let full = m.as_str();
663		pos = m.end();
664		if full.starts_with("<!--") {
665			continue;
666		}
667		let caps = tag_re.captures(full).unwrap();
668		let leading_slash = caps.get(1).map(|c| c.as_str()).unwrap_or("");
669		let tag_name_raw = caps.get(2).map(|c| c.as_str()).unwrap_or("");
670		let mut tag_name = tag_name_raw.to_string();
671		let tag_name_lc = tag_name_raw.to_ascii_lowercase();
672		if opts.lower_case_tag_name {
673			tag_name = tag_name_lc.clone();
674		}
675		if tag_name_lc == FRAMEFLAG {
676			continue;
677		}
678		let attr_part = caps.get(3).map(|c| c.as_str()).unwrap_or("");
679		// 兼容 issue_227 与 frameflag 包装:若错误地把 frameflag 的关闭标签吞进属性(出现 '</documentfragmentcontainer'),说明这是跨越边界的伪匹配,跳过
680		if leading_slash.is_empty() && attr_part.contains("</documentfragmentcontainer") {
681			continue;
682		}
683		let self_close = attr_part.trim_end().ends_with('/') || void_tag.is_void(&tag_name_lc);
684		if leading_slash.is_empty() {
685			// opening tag
686			if let Some(parent) = stack.last() {
687				if let Some(map) = closed_by_open.get(parent.raw.as_str()) {
688					if map.contains_key(tag_name.as_str()) {
689						stack.pop();
690					}
691				}
692			}
693			if !self_close && !void_tag.is_void(&tag_name_lc) {
694				let is_block = block_text.contains(tag_name_lc.as_str());
695				stack.push(SimpleEl {
696					raw: if opts.lower_case_tag_name {
697						tag_name_lc.clone()
698					} else {
699						tag_name.clone()
700					},
701				});
702				if is_block {
703					// 查找对应关闭标签(大小写不敏感)
704					let close_pat = format!("</{}>", tag_name_lc);
705					// 为避免多次分配,做一次 to_lowercase() 子串搜索
706					if let Some(rel_idx) = data[pos..].to_ascii_lowercase().find(&close_pat) {
707						let close_start = pos + rel_idx; // '<' of closing
708									   // 找到 '>'
709						if let Some(gt_rel) = data[close_start..].find('>') {
710							// pop block 元素(模拟遇到关闭标签)
711							stack.pop();
712							pos = close_start + gt_rel + 1; // 跳过整个关闭标签
713							continue; // 继续下一轮 find_at
714						} else {
715							// 没有 '>',视为未闭合,结束
716							break;
717						}
718					} else {
719						// 未找到关闭,视为未闭合 => 结束
720						break;
721					}
722				}
723			}
724		} else {
725			let target_lc = tag_name_lc.as_str();
726			// closing tag
727			loop {
728				if let Some(top) = stack.last() {
729					if top.raw.eq(target_lc) || top.raw.eq(tag_name.as_str()) {
730						stack.pop();
731						break;
732					}
733					if let Some(map) = closed_by_close.get(top.raw.as_str()) {
734						if map.contains_key(tag_name.as_str()) || map.contains_key(target_lc) {
735							stack.pop();
736							continue;
737						}
738					}
739				}
740				break;
741			}
742		}
743	}
744	// JS: stack length==1 表明完整闭合
745	let ok = stack.len() == 1;
746	ok
747}