Skip to main content

node_html_parser/parser/
core_parser.rs

1//! Core HTML parsing engine with optimized zero-copy implementation.
2
3use crate::dom::comment::CommentNode;
4use crate::dom::element::HTMLElement;
5use crate::dom::node::Node;
6use crate::dom::text::TextNode;
7use crate::dom::void_tag::VoidTag;
8use regex::Regex;
9use std::collections::HashMap;
10use std::sync::OnceLock;
11
12use super::attrs::parse_id_class_attrs_fast;
13use super::fast_parser::parse_tags_zero_copy;
14use super::types::{Options, StackEntry};
15use super::utils::{find_closing_tag_case_insensitive, strip_trailing_self_close_optimized};
16
17// 缓存编译好的正则表达式以避免重复编译
18static TAG_REGEX: OnceLock<Regex> = OnceLock::new();
19static ATTR_KEY_REGEX: OnceLock<Regex> = OnceLock::new();
20
21pub fn parse_with_options(input: &str, opts: &Options) -> Box<HTMLElement> {
22	// 改进:
23	// 1) 支持引号内含 '>'(通过 attrs 子模式保证只有引号外的 '>' 终止标签)
24	// 2) 扩展 tagName 的 Unicode 范围以对齐 JS 版本 kMarkupPattern,使自定义标签(含中文、阿拉伯等 BMP 范围字符)通过。
25	//    JS 的模式包含大量 Unicode 范围及高位平面;Rust regex 不支持 >BMP 直接类,这里截取到 BMP(满足当前测试需求)。
26	// 缓存编译好的正则表达式以避免重复编译(预留用于其他功能)
27	let _tag_re = TAG_REGEX.get_or_init(|| {
28		Regex::new(r#"<!--(?s:.*?)-->|<(\/)?([A-Za-z][A-Za-z0-9._:@\-\p{L}\p{M}]*)(?P<attrs>(?:[^>"']|"[^"]*"|'[^']*')*)(/?)>"#).unwrap()
29	});
30	let _attr_key_re = ATTR_KEY_REGEX.get_or_init(|| {
31		Regex::new(
32			r#"([a-zA-Z()\[\]#@$.?:][a-zA-Z0-9-._:()\[\]#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|[^\s>]+))?"#,
33		)
34		.unwrap()
35	});
36	let void_tag = VoidTag::new(&opts.void_tag);
37
38	// Elements closed by opening of specific following tags (subset of JS map)
39	let mut closed_by_open: HashMap<&'static str, Vec<&'static str>> = HashMap::new();
40	// helper: insert mapping for both lower & upper parent names (mirror JS maps having LI & li etc.)
41	macro_rules! ins_open {
42		($parent:expr, [ $($v:expr),* ]) => {{
43			closed_by_open.insert($parent, vec![$($v),*]);
44			let up = $parent.to_uppercase();
45			if up != $parent { closed_by_open.insert(Box::leak(up.into_boxed_str()), vec![$($v),*]); }
46		}};
47	}
48	ins_open!("li", ["li", "LI"]);
49	ins_open!("p", ["p", "P", "div", "DIV"]);
50	ins_open!("b", ["div", "DIV"]);
51	ins_open!("td", ["td", "th", "TD", "TH"]);
52	ins_open!("th", ["td", "th", "TD", "TH"]);
53	ins_open!("h1", ["h1", "H1"]);
54	ins_open!("h2", ["h2", "H2"]);
55	ins_open!("h3", ["h3", "H3"]);
56	ins_open!("h4", ["h4", "H4"]);
57	ins_open!("h5", ["h5", "H5"]);
58	ins_open!("h6", ["h6", "H6"]);
59
60	let mut closed_by_close: HashMap<&'static str, Vec<&'static str>> = HashMap::new();
61	macro_rules! ins_close {
62		($parent:expr, [ $($v:expr),* ]) => {{
63			closed_by_close.insert($parent, vec![$($v),*]);
64			let up = $parent.to_uppercase();
65			if up != $parent { closed_by_close.insert(Box::leak(up.into_boxed_str()), vec![$($v),*]); }
66		}};
67	}
68	ins_close!("li", ["ul", "UL", "ol", "OL"]);
69	ins_close!("a", ["div", "DIV"]);
70	ins_close!("b", ["div", "DIV"]);
71	ins_close!("i", ["div", "DIV"]);
72	ins_close!("p", ["div", "DIV"]);
73	ins_close!("td", ["tr", "TR", "table", "TABLE"]);
74	ins_close!("th", ["tr", "TR", "table", "TABLE"]);
75
76	let root = Box::new(HTMLElement::new(
77		None,
78		String::new(),
79		Vec::new(),
80		false,
81		opts.void_tag.add_closing_slash,
82	));
83	// 🚀 优化:预分配栈容量,减少深层嵌套时的重新分配
84	let mut stack: Vec<StackEntry> = Vec::with_capacity(32);
85	stack.push(StackEntry { elem: root });
86	// Safe mutable access to set options flags on root
87	if let Some(first) = stack.last_mut() {
88		first.elem.parse_comment = opts.comment;
89		first.elem.parse_lowercase = opts.lower_case_tag_name;
90	}
91	let mut no_nested_a_index: Option<usize> = None;
92
93	let frameflag = "documentfragmentcontainer";
94	let frame_prefix = format!("<{}>", frameflag);
95	let frame_suffix = format!("</{}>", frameflag);
96	let data = format!("{}{}{}", frame_prefix, input, frame_suffix);
97	let frame_offset = frame_prefix.len();
98
99	// 🔥 使用零拷贝手写解析器:消除83%字符串分配开销,提升性能
100	let tag_matches = parse_tags_zero_copy(&data);
101	let mut match_index = 0;
102	let mut last_text_pos = 0usize;
103
104	// 记录在 block 文本处理阶段被直接消耗的关闭标签位置
105	let mut skipped_closing_starts: std::collections::HashSet<usize> =
106		std::collections::HashSet::new();
107
108	// 遍历所有解析出的标签
109	while match_index < tag_matches.len() {
110		let tag_match = &tag_matches[match_index];
111		match_index += 1;
112
113		// 若该匹配为已消耗的 blockTextElements 的关闭标签,直接跳过
114		if skipped_closing_starts.contains(&tag_match.start) {
115			continue;
116		}
117
118		let match_start = tag_match.start;
119		let is_comment = tag_match.is_comment;
120		let _leading_slash = if tag_match.is_closing { "/" } else { "" };
121		let tag_name = tag_match.tag_name.to_string(); // 零拷贝版本:从&str转为String
122		let raw_attr_part = &tag_match.attrs;
123		let _trailing_self_close = tag_match.self_closing;
124		// 🚀 优化:使用预计算的自闭合标记检测
125		let (attr_part, trailing_self_close) = strip_trailing_self_close_optimized(raw_attr_part);
126		// 🚀 优化:将文本内容处理延迟,使用切片而不是立即克隆
127		if match_start > last_text_pos {
128			let text_slice = &data[last_text_pos..match_start];
129			if !text_slice.is_empty() {
130				let top = stack.last_mut().unwrap();
131				let start_src = last_text_pos.saturating_sub(frame_offset);
132				let end_src = match_start.saturating_sub(frame_offset);
133				// 只在实际需要时才分配字符串
134				top.elem.children.push(Node::Text(TextNode::with_range(
135					text_slice.to_string(), // 仍需分配,但至少减少了中间变量
136					start_src,
137					end_src,
138				)));
139			}
140		}
141		last_text_pos = tag_match.end;
142		if is_comment {
143			if opts.comment {
144				// 从完整的data中提取注释内容
145				let full_comment = &data[tag_match.start..tag_match.end];
146				let inner = full_comment
147					.trim_start_matches("<!--")
148					.trim_end_matches("-->");
149				let top = stack.last_mut().unwrap();
150				// 记录注释在原输入中的范围(包含 <!-- --> 符号),与元素/文本一样使用 (start,end) 半开区间
151				let start_src = match_start.saturating_sub(frame_offset);
152				let end_src = tag_match.end.saturating_sub(frame_offset);
153				top.elem
154					.children
155					.push(Node::Comment(CommentNode::with_range(
156						inner.to_string(),
157						start_src,
158						end_src,
159					)));
160			}
161			continue;
162		}
163		if tag_name == frameflag {
164			continue;
165		}
166
167		// 🔥 优化:统一计算小写版本,避免重复转换
168		let lower_tag_name = tag_name.to_lowercase(); // 零拷贝版本需要转换为String用于比较
169
170		let final_tag_name = if opts.lower_case_tag_name {
171			&lower_tag_name
172		} else {
173			&tag_name
174		};
175
176		let mut self_closing = trailing_self_close
177			|| attr_part.trim_end().ends_with('/') // 保险:兼容旧逻辑(理论上已剥离)
178            || void_tag.is_void(final_tag_name);
179
180		if !tag_match.is_closing {
181			// opening tag
182			// auto close logic (original behavior: just pop parent when needed)
183			if !opts.preserve_tag_nesting {
184				if let Some(parent) = stack.last() {
185					if let Some(list) = closed_by_open.get(parent.elem.name()) {
186						if list
187							.iter()
188							.any(|t| **t == tag_name || *t == tag_name.to_uppercase())
189						{
190							if stack.len() > 1 {
191								let closed = stack.pop().unwrap();
192								stack
193									.last_mut()
194									.unwrap()
195									.elem
196									.children
197									.push(Node::Element(closed.elem));
198							}
199						}
200					}
201				}
202			}
203			// fix nested A tags
204			if opts.fix_nested_a_tags && (final_tag_name.eq("a") || final_tag_name.eq("A")) {
205				if let Some(idx) = no_nested_a_index {
206					while stack.len() > idx {
207						let closed = stack.pop().unwrap();
208						stack
209							.last_mut()
210							.unwrap()
211							.elem
212							.children
213							.push(Node::Element(closed.elem));
214					}
215				}
216				no_nested_a_index = Some(stack.len());
217			}
218
219			// 🚀 优化:使用高效的手写属性解析器替代正则表达式
220			let (attrs, saw_other_attr) = parse_id_class_attrs_fast(&attr_part);
221			// 🚀 优化:避免不必要的字符串克隆
222			let raw_attr_string = if attr_part.starts_with(' ') && attr_part.len() > 1 {
223				attr_part[1..].to_string()
224			} else {
225				attr_part.to_string()
226			};
227			let mut elem = Box::new(HTMLElement::new(
228				Some(final_tag_name.to_string()),
229				raw_attr_string,
230				attrs,
231				self_closing && void_tag.is_void(final_tag_name),
232				opts.void_tag.add_closing_slash,
233			));
234			// range 将在真正闭合或自闭合时由 set_range_start / set_range_end 赋值
235			// 记录解析选项供后续 set_content 继承
236			elem.parse_comment = opts.comment;
237			elem.parse_lowercase = opts.lower_case_tag_name;
238			if saw_other_attr {
239				elem.attrs_complete = false;
240			}
241			let open_start = match_start.saturating_sub(frame_offset);
242			let open_end = tag_match.end.saturating_sub(frame_offset);
243			// provisional start
244			elem.set_range_start(open_start);
245			if self_closing {
246				// for self-closing tag finalize range
247				elem.set_range_end(open_end);
248			}
249			// 解析阶段填充 id 字段,便于后续 structure / closest 等直接使用(与 JS keyAttrs.id 行为对齐)
250			if let Some((_, v)) = elem.attrs.iter().find(|(k, _)| k == "id") {
251				elem.id = v.clone();
252			}
253
254			// block text element handling: capture inner text verbatim until closing tag
255			if let Some(extract) = opts.block_text_elements.get(&lower_tag_name) {
256				// 查找对应关闭标签(大小写不敏感,参考 JS 逻辑)
257				let close_markup = format!("</{}>", final_tag_name);
258				let search_slice = &data[last_text_pos..];
259
260				// 🚀 优化:使用高效的大小写不敏感搜索,避免创建整个文档的小写副本
261				if let Some(rel) = find_closing_tag_case_insensitive(search_slice, &close_markup) {
262					let close_start = last_text_pos + rel; // 关闭标签 '<' 起始位置
263					let suppress = opts.suppress_script_style_text
264						&& (lower_tag_name == "script" || lower_tag_name == "style");
265					if *extract && !suppress {
266						// true -> 提取文本(除非被全局抑制)
267						let inner_text = &data[last_text_pos..close_start];
268						if !inner_text.is_empty() {
269							let inner_start = last_text_pos.saturating_sub(frame_offset);
270							let inner_end = close_start.saturating_sub(frame_offset);
271							elem.children.push(Node::Text(TextNode::with_range(
272								inner_text.to_string(),
273								inner_start,
274								inner_end,
275							)));
276						}
277					}
278					// 无论是否提取文本,都跳过关闭标签匹配:记录其起始位置,供主循环跳过
279					skipped_closing_starts.insert(close_start);
280					last_text_pos = close_start + close_markup.len();
281					let close_end_src =
282						(close_start + close_markup.len()).saturating_sub(frame_offset);
283					elem.set_range_end(close_end_src);
284					self_closing = true; // 强制立即闭合
285				} else {
286					// 未找到关闭标签:与 JS 一致,标记 last_text_pos 到末尾避免后续文本节点重复
287					last_text_pos = data.len() + 1;
288				}
289			}
290			// 若因 block_textElements 强制 self_closing,将 elem.children 保持现状并不入栈
291
292			if self_closing {
293				let parent = stack.last_mut().unwrap();
294				let parent_ptr: *mut HTMLElement = &mut *parent.elem;
295				elem.parent = Some(parent_ptr);
296				parent.elem.children.push(Node::Element(elem));
297			} else {
298				let parent_ptr: *mut HTMLElement = &mut *stack.last_mut().unwrap().elem;
299				elem.parent = Some(parent_ptr);
300				stack.push(StackEntry { elem });
301			}
302		} else {
303			// closing tag
304			// remove nested a index if closing A
305			if no_nested_a_index.is_some() && (final_tag_name.eq("a") || final_tag_name.eq("A")) {
306				no_nested_a_index = None;
307			}
308			// 🚀 优化:使用预计算的小写版本
309			let target = &lower_tag_name;
310			// try to find matching open
311			let mut i = stack.len();
312			while i > 1 {
313				// skip root at 0
314				i -= 1;
315				if stack[i].elem.name().eq_ignore_ascii_case(target) {
316					while stack.len() > i + 1 {
317						let closed = stack.pop().unwrap();
318						let parent = stack.last_mut().unwrap();
319						let mut e = closed.elem;
320						let parent_ptr: *mut HTMLElement = &mut *parent.elem;
321						e.parent = Some(parent_ptr);
322						let close_end = tag_match.end.saturating_sub(frame_offset);
323						if e.range().is_some() {
324							e.set_range_end(close_end);
325						}
326						parent.elem.children.push(Node::Element(e));
327					}
328					let closed = stack.pop().unwrap();
329					let parent = stack.last_mut().unwrap();
330					let mut e = closed.elem;
331					let parent_ptr: *mut HTMLElement = &mut *parent.elem;
332					e.parent = Some(parent_ptr);
333					let close_end_main = tag_match.end.saturating_sub(frame_offset);
334					if e.range().is_some() {
335						e.set_range_end(close_end_main);
336					}
337					parent.elem.children.push(Node::Element(e));
338					break;
339				} else {
340					// aggressive strategy: if parent would be auto-closed by this closing tag
341					let parent_name = stack[i].elem.name().to_lowercase();
342					if let Some(list) = closed_by_close.get(parent_name.as_str()) {
343						if list.iter().any(|x| x.eq_ignore_ascii_case(&tag_name)) {
344							let closed = stack.pop().unwrap();
345							let parent = stack.last_mut().unwrap();
346							let mut e = closed.elem;
347							let parent_ptr: *mut HTMLElement = &mut *parent.elem;
348							e.parent = Some(parent_ptr);
349							parent.elem.children.push(Node::Element(e));
350							continue;
351						}
352					}
353				}
354			}
355		}
356	}
357	// trailing text if any
358	if last_text_pos < data.len() {
359		let text = &data[last_text_pos..];
360		if !text.is_empty() {
361			let top = stack.last_mut().unwrap();
362			let start_src = last_text_pos.saturating_sub(frame_offset);
363			let end_src = data.len().saturating_sub(frame_offset);
364			top.elem.children.push(Node::Text(TextNode::with_range(
365				text.to_string(),
366				start_src,
367				end_src,
368			)));
369		}
370	}
371	// unwind unless parse_none_closed_tags keeps them as-is
372	if opts.parse_none_closed_tags {
373		// 保持原样直接线性挂接
374		while stack.len() > 1 {
375			let closed = stack.pop().unwrap();
376			let parent = stack.last_mut().unwrap();
377			let mut e = closed.elem;
378			let parent_ptr: *mut HTMLElement = &mut *parent.elem;
379			e.parent = Some(parent_ptr);
380			parent.elem.children.push(Node::Element(e));
381		}
382		let root = stack.pop().unwrap().elem;
383		return root;
384	}
385	// JS parse() 错误修复阶段:处理 pair error 与 single error
386	// stack[0] 为 root, 其余为未闭合链。模拟 JS: while stack.length > 1 { let last = pop(); let oneBefore = back(); ... }
387	while stack.len() > 1 {
388		let last = stack.pop().unwrap();
389		let one_before = stack.last_mut().unwrap();
390		let mut last_elem = last.elem; // detached error element
391								 // 简化 pair error 逻辑:若相邻未闭合标签标签名相同,视为 pair(与 JS 修复阶段行为保持)
392		let is_pair = last_elem.name() == one_before.elem.name();
393		if is_pair {
394			// pair error 修复:移除 last,将其子节点上提到 oneBefore 的父级
395			// 这里简化:直接附加到 one_before.elem 的父(若存在)
396			// pair 修复:移除 one_before 的重复子元素,将 last 的子节点上移到 one_before 的父级
397			if let Some(one_parent_ptr) = one_before.elem.parent {
398				unsafe {
399					let one_parent = &mut *one_parent_ptr;
400					one_parent.remove_children_where(
401						|n| matches!(n, Node::Element(e) if e.name()==last_elem.name()),
402					);
403					for mut child in last_elem.children.drain(..) {
404						if let Node::Element(e) = &mut child {
405							let parent_ptr: *mut HTMLElement = one_parent_ptr;
406							e.parent = Some(parent_ptr);
407						}
408						one_parent.children.push(child);
409					}
410				}
411			}
412			continue;
413		}
414		// single error: remove last but keep its children inside one_before
415		// 但若这是文件尾部未闭合的正常标签(比如 <div><p>abc),应保留 last_elem 并设置其 range 结束到末尾
416		let end_fix_needed =
417			last_elem.range().is_some() && last_elem.range().unwrap().1 < input.len();
418		if end_fix_needed {
419			// finalize range end to input end
420			last_elem.set_range_end(input.len());
421			// 挂回 one_before
422			let parent_ptr: *mut HTMLElement = &mut *one_before.elem;
423			last_elem.parent = Some(parent_ptr);
424			one_before.elem.children.push(Node::Element(last_elem));
425			continue;
426		} else {
427			let target_name = last_elem.name().to_string();
428			one_before
429				.elem
430				.remove_children_where(|n| matches!(n, Node::Element(e) if e.name()==target_name));
431			for mut child in last_elem.children.drain(..) {
432				match &mut child {
433					Node::Element(e) => {
434						let parent_ptr: *mut HTMLElement = &mut *one_before.elem;
435						e.parent = Some(parent_ptr);
436					}
437					_ => {}
438				}
439				one_before.elem.children.push(child);
440			}
441		}
442	}
443	// 最终收束:stack 剩 root
444	let root = stack.pop().unwrap().elem;
445	// 后处理:相邻重复 heading(h1-h6)时,将后者子节点(非空内容)提升为前者之后的兄弟,并移除后者。
446	// 目的:复刻 JS parse() 在修复相邻或嵌套 heading 过程中产生的展平效果(见 tests 中 h3 链相关用例)。
447	fn promote_heading_duplicates(node: &mut HTMLElement) {
448		use crate::dom::node::Node;
449		let mut i = 0;
450		while i + 1 < node.children.len() {
451			let promote = match (&node.children[i], &node.children[i + 1]) {
452				(Node::Element(a), Node::Element(b)) => {
453					let n1 = a.name();
454					let n2 = b.name();
455					if n1 == n2 && matches!(n1, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
456						Some((n1.to_string(), i + 1))
457					} else {
458						None
459					}
460				}
461				_ => None,
462			};
463			// 如果当前是 heading,后面紧跟不同标签且当前 heading 没有结束 range,补一个空结束(使序列化产生 </h3>)
464			if promote.is_none() && i + 1 < node.children.len() {
465				// safe split to avoid aliasing mutable/immutable borrows
466				let (left, right) = node.children.split_at_mut(i + 1); // left has up to i
467				if let Some(Node::Element(h)) = left.last_mut() {
468					if let Some(Node::Element(next_el)) = right.first() {
469						let hn = h.name();
470						if matches!(hn, "h1" | "h2" | "h3" | "h4" | "h5" | "h6")
471							&& hn != next_el.name()
472						{
473							if let Some(r) = h.range() {
474								if r.0 == r.1 {
475									h.set_range_end(r.0);
476								}
477							}
478						}
479					}
480				}
481			}
482			if let Some((_name, dup_idx)) = promote {
483				// 取出重复 heading
484				let mut dup = match node.children.remove(dup_idx) {
485					Node::Element(e) => e,
486					_ => unreachable!(),
487				};
488				// 过滤并提升其子节点:丢弃空元素(无属性 & 无子节点)
489				let insertion_pos = i + 1; // 在原第一 heading 之后依次插入
490				let mut promoted: Vec<Node> = Vec::new();
491				for child in dup.children.drain(..) {
492					let keep = match &child {
493						Node::Element(e) => {
494							let name = e.name();
495							// 仅过滤"空 div" (无属性且无子节点);其它标签即使空也保留(如 span)以匹配 js 行为
496							if name == "div" && e.raw_attrs.is_empty() && e.children.is_empty() {
497								false
498							} else {
499								true
500							}
501						}
502						_ => true,
503					};
504					if keep {
505						promoted.push(child);
506					}
507				}
508				for (offset, mut ch) in promoted.into_iter().enumerate() {
509					if let Node::Element(ref mut e) = ch {
510						e.parent = Some(node as *mut HTMLElement);
511					}
512					node.children.insert(insertion_pos + offset, ch);
513				}
514				// 若第一个 heading 仍为空且 range 未闭合 (start==end),为使序列化输出闭合标签,模拟设置结束位置
515				if let Node::Element(first_h) = &mut node.children[i] {
516					if matches!(first_h.name(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
517						if let Some((s, e)) = first_h.range() {
518							if s == e {
519								first_h.set_range_end(s + 1);
520							}
521						}
522					}
523				}
524				// 不递增 i,重新检查当前位置后续可能还有重复
525				continue;
526			}
527			i += 1;
528		}
529		// 递归
530		for child in node.children.iter_mut() {
531			if let Node::Element(e) = child {
532				promote_heading_duplicates(e);
533			}
534		}
535	}
536	let mut root_box = root;
537	promote_heading_duplicates(&mut root_box);
538	root_box
539}