node-html-parser 0.1.1

Fast HTML parser for Rust & WASM producing a lightweight DOM with CSS selector querying.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
//! Core HTML parsing engine with optimized zero-copy implementation.

use crate::dom::comment::CommentNode;
use crate::dom::element::HTMLElement;
use crate::dom::node::Node;
use crate::dom::text::TextNode;
use crate::dom::void_tag::VoidTag;
use regex::Regex;
use std::collections::HashMap;
use std::sync::OnceLock;

use super::attrs::parse_id_class_attrs_fast;
use super::fast_parser::parse_tags_zero_copy;
use super::types::{Options, StackEntry};
use super::utils::{find_closing_tag_case_insensitive, strip_trailing_self_close_optimized};

// 缓存编译好的正则表达式以避免重复编译
static TAG_REGEX: OnceLock<Regex> = OnceLock::new();
static ATTR_KEY_REGEX: OnceLock<Regex> = OnceLock::new();

pub fn parse_with_options(input: &str, opts: &Options) -> Box<HTMLElement> {
	// 改进:
	// 1) 支持引号内含 '>'(通过 attrs 子模式保证只有引号外的 '>' 终止标签)
	// 2) 扩展 tagName 的 Unicode 范围以对齐 JS 版本 kMarkupPattern,使自定义标签(含中文、阿拉伯等 BMP 范围字符)通过。
	//    JS 的模式包含大量 Unicode 范围及高位平面;Rust regex 不支持 >BMP 直接类,这里截取到 BMP(满足当前测试需求)。
	// 缓存编译好的正则表达式以避免重复编译(预留用于其他功能)
	let _tag_re = TAG_REGEX.get_or_init(|| {
		Regex::new(r#"<!--(?s:.*?)-->|<(\/)?([A-Za-z][A-Za-z0-9._:@\-\p{L}\p{M}]*)(?P<attrs>(?:[^>"']|"[^"]*"|'[^']*')*)(/?)>"#).unwrap()
	});
	let _attr_key_re = ATTR_KEY_REGEX.get_or_init(|| {
		Regex::new(
			r#"([a-zA-Z()\[\]#@$.?:][a-zA-Z0-9-._:()\[\]#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|[^\s>]+))?"#,
		)
		.unwrap()
	});
	let void_tag = VoidTag::new(&opts.void_tag);

	// Elements closed by opening of specific following tags (subset of JS map)
	let mut closed_by_open: HashMap<&'static str, Vec<&'static str>> = HashMap::new();
	// helper: insert mapping for both lower & upper parent names (mirror JS maps having LI & li etc.)
	macro_rules! ins_open {
		($parent:expr, [ $($v:expr),* ]) => {{
			closed_by_open.insert($parent, vec![$($v),*]);
			let up = $parent.to_uppercase();
			if up != $parent { closed_by_open.insert(Box::leak(up.into_boxed_str()), vec![$($v),*]); }
		}};
	}
	ins_open!("li", ["li", "LI"]);
	ins_open!("p", ["p", "P", "div", "DIV"]);
	ins_open!("b", ["div", "DIV"]);
	ins_open!("td", ["td", "th", "TD", "TH"]);
	ins_open!("th", ["td", "th", "TD", "TH"]);
	ins_open!("h1", ["h1", "H1"]);
	ins_open!("h2", ["h2", "H2"]);
	ins_open!("h3", ["h3", "H3"]);
	ins_open!("h4", ["h4", "H4"]);
	ins_open!("h5", ["h5", "H5"]);
	ins_open!("h6", ["h6", "H6"]);

	let mut closed_by_close: HashMap<&'static str, Vec<&'static str>> = HashMap::new();
	macro_rules! ins_close {
		($parent:expr, [ $($v:expr),* ]) => {{
			closed_by_close.insert($parent, vec![$($v),*]);
			let up = $parent.to_uppercase();
			if up != $parent { closed_by_close.insert(Box::leak(up.into_boxed_str()), vec![$($v),*]); }
		}};
	}
	ins_close!("li", ["ul", "UL", "ol", "OL"]);
	ins_close!("a", ["div", "DIV"]);
	ins_close!("b", ["div", "DIV"]);
	ins_close!("i", ["div", "DIV"]);
	ins_close!("p", ["div", "DIV"]);
	ins_close!("td", ["tr", "TR", "table", "TABLE"]);
	ins_close!("th", ["tr", "TR", "table", "TABLE"]);

	let root = Box::new(HTMLElement::new(
		None,
		String::new(),
		Vec::new(),
		false,
		opts.void_tag.add_closing_slash,
	));
	// 🚀 优化:预分配栈容量,减少深层嵌套时的重新分配
	let mut stack: Vec<StackEntry> = Vec::with_capacity(32);
	stack.push(StackEntry { elem: root });
	// Safe mutable access to set options flags on root
	if let Some(first) = stack.last_mut() {
		first.elem.parse_comment = opts.comment;
		first.elem.parse_lowercase = opts.lower_case_tag_name;
	}
	let mut no_nested_a_index: Option<usize> = None;

	let frameflag = "documentfragmentcontainer";
	let frame_prefix = format!("<{}>", frameflag);
	let frame_suffix = format!("</{}>", frameflag);
	let data = format!("{}{}{}", frame_prefix, input, frame_suffix);
	let frame_offset = frame_prefix.len();

	// 🔥 使用零拷贝手写解析器:消除83%字符串分配开销,提升性能
	let tag_matches = parse_tags_zero_copy(&data);
	let mut match_index = 0;
	let mut last_text_pos = 0usize;

	// 记录在 block 文本处理阶段被直接消耗的关闭标签位置
	let mut skipped_closing_starts: std::collections::HashSet<usize> =
		std::collections::HashSet::new();

	// 遍历所有解析出的标签
	while match_index < tag_matches.len() {
		let tag_match = &tag_matches[match_index];
		match_index += 1;

		// 若该匹配为已消耗的 blockTextElements 的关闭标签,直接跳过
		if skipped_closing_starts.contains(&tag_match.start) {
			continue;
		}

		let match_start = tag_match.start;
		let is_comment = tag_match.is_comment;
		let _leading_slash = if tag_match.is_closing { "/" } else { "" };
		let tag_name = tag_match.tag_name.to_string(); // 零拷贝版本:从&str转为String
		let raw_attr_part = &tag_match.attrs;
		let _trailing_self_close = tag_match.self_closing;
		// 🚀 优化:使用预计算的自闭合标记检测
		let (attr_part, trailing_self_close) = strip_trailing_self_close_optimized(raw_attr_part);
		// 🚀 优化:将文本内容处理延迟,使用切片而不是立即克隆
		if match_start > last_text_pos {
			let text_slice = &data[last_text_pos..match_start];
			if !text_slice.is_empty() {
				let top = stack.last_mut().unwrap();
				let start_src = last_text_pos.saturating_sub(frame_offset);
				let end_src = match_start.saturating_sub(frame_offset);
				// 只在实际需要时才分配字符串
				top.elem.children.push(Node::Text(TextNode::with_range(
					text_slice.to_string(), // 仍需分配,但至少减少了中间变量
					start_src,
					end_src,
				)));
			}
		}
		last_text_pos = tag_match.end;
		if is_comment {
			if opts.comment {
				// 从完整的data中提取注释内容
				let full_comment = &data[tag_match.start..tag_match.end];
				let inner = full_comment
					.trim_start_matches("<!--")
					.trim_end_matches("-->");
				let top = stack.last_mut().unwrap();
				// 记录注释在原输入中的范围(包含 <!-- --> 符号),与元素/文本一样使用 (start,end) 半开区间
				let start_src = match_start.saturating_sub(frame_offset);
				let end_src = tag_match.end.saturating_sub(frame_offset);
				top.elem
					.children
					.push(Node::Comment(CommentNode::with_range(
						inner.to_string(),
						start_src,
						end_src,
					)));
			}
			continue;
		}
		if tag_name == frameflag {
			continue;
		}

		// 🔥 优化:统一计算小写版本,避免重复转换
		let lower_tag_name = tag_name.to_lowercase(); // 零拷贝版本需要转换为String用于比较

		let final_tag_name = if opts.lower_case_tag_name {
			&lower_tag_name
		} else {
			&tag_name
		};

		let mut self_closing = trailing_self_close
			|| attr_part.trim_end().ends_with('/') // 保险:兼容旧逻辑(理论上已剥离)
            || void_tag.is_void(final_tag_name);

		if !tag_match.is_closing {
			// opening tag
			// auto close logic (original behavior: just pop parent when needed)
			if !opts.preserve_tag_nesting {
				if let Some(parent) = stack.last() {
					if let Some(list) = closed_by_open.get(parent.elem.name()) {
						if list
							.iter()
							.any(|t| **t == tag_name || *t == tag_name.to_uppercase())
						{
							if stack.len() > 1 {
								let closed = stack.pop().unwrap();
								stack
									.last_mut()
									.unwrap()
									.elem
									.children
									.push(Node::Element(closed.elem));
							}
						}
					}
				}
			}
			// fix nested A tags
			if opts.fix_nested_a_tags && (final_tag_name.eq("a") || final_tag_name.eq("A")) {
				if let Some(idx) = no_nested_a_index {
					while stack.len() > idx {
						let closed = stack.pop().unwrap();
						stack
							.last_mut()
							.unwrap()
							.elem
							.children
							.push(Node::Element(closed.elem));
					}
				}
				no_nested_a_index = Some(stack.len());
			}

			// 🚀 优化:使用高效的手写属性解析器替代正则表达式
			let (attrs, saw_other_attr) = parse_id_class_attrs_fast(&attr_part);
			// 🚀 优化:避免不必要的字符串克隆
			let raw_attr_string = if attr_part.starts_with(' ') && attr_part.len() > 1 {
				attr_part[1..].to_string()
			} else {
				attr_part.to_string()
			};
			let mut elem = Box::new(HTMLElement::new(
				Some(final_tag_name.to_string()),
				raw_attr_string,
				attrs,
				self_closing && void_tag.is_void(final_tag_name),
				opts.void_tag.add_closing_slash,
			));
			// range 将在真正闭合或自闭合时由 set_range_start / set_range_end 赋值
			// 记录解析选项供后续 set_content 继承
			elem.parse_comment = opts.comment;
			elem.parse_lowercase = opts.lower_case_tag_name;
			if saw_other_attr {
				elem.attrs_complete = false;
			}
			let open_start = match_start.saturating_sub(frame_offset);
			let open_end = tag_match.end.saturating_sub(frame_offset);
			// provisional start
			elem.set_range_start(open_start);
			if self_closing {
				// for self-closing tag finalize range
				elem.set_range_end(open_end);
			}
			// 解析阶段填充 id 字段,便于后续 structure / closest 等直接使用(与 JS keyAttrs.id 行为对齐)
			if let Some((_, v)) = elem.attrs.iter().find(|(k, _)| k == "id") {
				elem.id = v.clone();
			}

			// block text element handling: capture inner text verbatim until closing tag
			if let Some(extract) = opts.block_text_elements.get(&lower_tag_name) {
				// 查找对应关闭标签(大小写不敏感,参考 JS 逻辑)
				let close_markup = format!("</{}>", final_tag_name);
				let search_slice = &data[last_text_pos..];

				// 🚀 优化:使用高效的大小写不敏感搜索,避免创建整个文档的小写副本
				if let Some(rel) = find_closing_tag_case_insensitive(search_slice, &close_markup) {
					let close_start = last_text_pos + rel; // 关闭标签 '<' 起始位置
					let suppress = opts.suppress_script_style_text
						&& (lower_tag_name == "script" || lower_tag_name == "style");
					if *extract && !suppress {
						// true -> 提取文本(除非被全局抑制)
						let inner_text = &data[last_text_pos..close_start];
						if !inner_text.is_empty() {
							let inner_start = last_text_pos.saturating_sub(frame_offset);
							let inner_end = close_start.saturating_sub(frame_offset);
							elem.children.push(Node::Text(TextNode::with_range(
								inner_text.to_string(),
								inner_start,
								inner_end,
							)));
						}
					}
					// 无论是否提取文本,都跳过关闭标签匹配:记录其起始位置,供主循环跳过
					skipped_closing_starts.insert(close_start);
					last_text_pos = close_start + close_markup.len();
					let close_end_src =
						(close_start + close_markup.len()).saturating_sub(frame_offset);
					elem.set_range_end(close_end_src);
					self_closing = true; // 强制立即闭合
				} else {
					// 未找到关闭标签:与 JS 一致,标记 last_text_pos 到末尾避免后续文本节点重复
					last_text_pos = data.len() + 1;
				}
			}
			// 若因 block_textElements 强制 self_closing,将 elem.children 保持现状并不入栈

			if self_closing {
				let parent = stack.last_mut().unwrap();
				let parent_ptr: *mut HTMLElement = &mut *parent.elem;
				elem.parent = Some(parent_ptr);
				parent.elem.children.push(Node::Element(elem));
			} else {
				let parent_ptr: *mut HTMLElement = &mut *stack.last_mut().unwrap().elem;
				elem.parent = Some(parent_ptr);
				stack.push(StackEntry { elem });
			}
		} else {
			// closing tag
			// remove nested a index if closing A
			if no_nested_a_index.is_some() && (final_tag_name.eq("a") || final_tag_name.eq("A")) {
				no_nested_a_index = None;
			}
			// 🚀 优化:使用预计算的小写版本
			let target = &lower_tag_name;
			// try to find matching open
			let mut i = stack.len();
			while i > 1 {
				// skip root at 0
				i -= 1;
				if stack[i].elem.name().eq_ignore_ascii_case(target) {
					while stack.len() > i + 1 {
						let closed = stack.pop().unwrap();
						let parent = stack.last_mut().unwrap();
						let mut e = closed.elem;
						let parent_ptr: *mut HTMLElement = &mut *parent.elem;
						e.parent = Some(parent_ptr);
						let close_end = tag_match.end.saturating_sub(frame_offset);
						if e.range().is_some() {
							e.set_range_end(close_end);
						}
						parent.elem.children.push(Node::Element(e));
					}
					let closed = stack.pop().unwrap();
					let parent = stack.last_mut().unwrap();
					let mut e = closed.elem;
					let parent_ptr: *mut HTMLElement = &mut *parent.elem;
					e.parent = Some(parent_ptr);
					let close_end_main = tag_match.end.saturating_sub(frame_offset);
					if e.range().is_some() {
						e.set_range_end(close_end_main);
					}
					parent.elem.children.push(Node::Element(e));
					break;
				} else {
					// aggressive strategy: if parent would be auto-closed by this closing tag
					let parent_name = stack[i].elem.name().to_lowercase();
					if let Some(list) = closed_by_close.get(parent_name.as_str()) {
						if list.iter().any(|x| x.eq_ignore_ascii_case(&tag_name)) {
							let closed = stack.pop().unwrap();
							let parent = stack.last_mut().unwrap();
							let mut e = closed.elem;
							let parent_ptr: *mut HTMLElement = &mut *parent.elem;
							e.parent = Some(parent_ptr);
							parent.elem.children.push(Node::Element(e));
							continue;
						}
					}
				}
			}
		}
	}
	// trailing text if any
	if last_text_pos < data.len() {
		let text = &data[last_text_pos..];
		if !text.is_empty() {
			let top = stack.last_mut().unwrap();
			let start_src = last_text_pos.saturating_sub(frame_offset);
			let end_src = data.len().saturating_sub(frame_offset);
			top.elem.children.push(Node::Text(TextNode::with_range(
				text.to_string(),
				start_src,
				end_src,
			)));
		}
	}
	// unwind unless parse_none_closed_tags keeps them as-is
	if opts.parse_none_closed_tags {
		// 保持原样直接线性挂接
		while stack.len() > 1 {
			let closed = stack.pop().unwrap();
			let parent = stack.last_mut().unwrap();
			let mut e = closed.elem;
			let parent_ptr: *mut HTMLElement = &mut *parent.elem;
			e.parent = Some(parent_ptr);
			parent.elem.children.push(Node::Element(e));
		}
		let root = stack.pop().unwrap().elem;
		return root;
	}
	// JS parse() 错误修复阶段:处理 pair error 与 single error
	// stack[0] 为 root, 其余为未闭合链。模拟 JS: while stack.length > 1 { let last = pop(); let oneBefore = back(); ... }
	while stack.len() > 1 {
		let last = stack.pop().unwrap();
		let one_before = stack.last_mut().unwrap();
		let mut last_elem = last.elem; // detached error element
								 // 简化 pair error 逻辑:若相邻未闭合标签标签名相同,视为 pair(与 JS 修复阶段行为保持)
		let is_pair = last_elem.name() == one_before.elem.name();
		if is_pair {
			// pair error 修复:移除 last,将其子节点上提到 oneBefore 的父级
			// 这里简化:直接附加到 one_before.elem 的父(若存在)
			// pair 修复:移除 one_before 的重复子元素,将 last 的子节点上移到 one_before 的父级
			if let Some(one_parent_ptr) = one_before.elem.parent {
				unsafe {
					let one_parent = &mut *one_parent_ptr;
					one_parent.remove_children_where(
						|n| matches!(n, Node::Element(e) if e.name()==last_elem.name()),
					);
					for mut child in last_elem.children.drain(..) {
						if let Node::Element(e) = &mut child {
							let parent_ptr: *mut HTMLElement = one_parent_ptr;
							e.parent = Some(parent_ptr);
						}
						one_parent.children.push(child);
					}
				}
			}
			continue;
		}
		// single error: remove last but keep its children inside one_before
		// 但若这是文件尾部未闭合的正常标签(比如 <div><p>abc),应保留 last_elem 并设置其 range 结束到末尾
		let end_fix_needed =
			last_elem.range().is_some() && last_elem.range().unwrap().1 < input.len();
		if end_fix_needed {
			// finalize range end to input end
			last_elem.set_range_end(input.len());
			// 挂回 one_before
			let parent_ptr: *mut HTMLElement = &mut *one_before.elem;
			last_elem.parent = Some(parent_ptr);
			one_before.elem.children.push(Node::Element(last_elem));
			continue;
		} else {
			let target_name = last_elem.name().to_string();
			one_before
				.elem
				.remove_children_where(|n| matches!(n, Node::Element(e) if e.name()==target_name));
			for mut child in last_elem.children.drain(..) {
				match &mut child {
					Node::Element(e) => {
						let parent_ptr: *mut HTMLElement = &mut *one_before.elem;
						e.parent = Some(parent_ptr);
					}
					_ => {}
				}
				one_before.elem.children.push(child);
			}
		}
	}
	// 最终收束:stack 剩 root
	let root = stack.pop().unwrap().elem;
	// 后处理:相邻重复 heading(h1-h6)时,将后者子节点(非空内容)提升为前者之后的兄弟,并移除后者。
	// 目的:复刻 JS parse() 在修复相邻或嵌套 heading 过程中产生的展平效果(见 tests 中 h3 链相关用例)。
	fn promote_heading_duplicates(node: &mut HTMLElement) {
		use crate::dom::node::Node;
		let mut i = 0;
		while i + 1 < node.children.len() {
			let promote = match (&node.children[i], &node.children[i + 1]) {
				(Node::Element(a), Node::Element(b)) => {
					let n1 = a.name();
					let n2 = b.name();
					if n1 == n2 && matches!(n1, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
						Some((n1.to_string(), i + 1))
					} else {
						None
					}
				}
				_ => None,
			};
			// 如果当前是 heading,后面紧跟不同标签且当前 heading 没有结束 range,补一个空结束(使序列化产生 </h3>)
			if promote.is_none() && i + 1 < node.children.len() {
				// safe split to avoid aliasing mutable/immutable borrows
				let (left, right) = node.children.split_at_mut(i + 1); // left has up to i
				if let Some(Node::Element(h)) = left.last_mut() {
					if let Some(Node::Element(next_el)) = right.first() {
						let hn = h.name();
						if matches!(hn, "h1" | "h2" | "h3" | "h4" | "h5" | "h6")
							&& hn != next_el.name()
						{
							if let Some(r) = h.range() {
								if r.0 == r.1 {
									h.set_range_end(r.0);
								}
							}
						}
					}
				}
			}
			if let Some((_name, dup_idx)) = promote {
				// 取出重复 heading
				let mut dup = match node.children.remove(dup_idx) {
					Node::Element(e) => e,
					_ => unreachable!(),
				};
				// 过滤并提升其子节点:丢弃空元素(无属性 & 无子节点)
				let insertion_pos = i + 1; // 在原第一 heading 之后依次插入
				let mut promoted: Vec<Node> = Vec::new();
				for child in dup.children.drain(..) {
					let keep = match &child {
						Node::Element(e) => {
							let name = e.name();
							// 仅过滤"空 div" (无属性且无子节点);其它标签即使空也保留(如 span)以匹配 js 行为
							if name == "div" && e.raw_attrs.is_empty() && e.children.is_empty() {
								false
							} else {
								true
							}
						}
						_ => true,
					};
					if keep {
						promoted.push(child);
					}
				}
				for (offset, mut ch) in promoted.into_iter().enumerate() {
					if let Node::Element(ref mut e) = ch {
						e.parent = Some(node as *mut HTMLElement);
					}
					node.children.insert(insertion_pos + offset, ch);
				}
				// 若第一个 heading 仍为空且 range 未闭合 (start==end),为使序列化输出闭合标签,模拟设置结束位置
				if let Node::Element(first_h) = &mut node.children[i] {
					if matches!(first_h.name(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
						if let Some((s, e)) = first_h.range() {
							if s == e {
								first_h.set_range_end(s + 1);
							}
						}
					}
				}
				// 不递增 i,重新检查当前位置后续可能还有重复
				continue;
			}
			i += 1;
		}
		// 递归
		for child in node.children.iter_mut() {
			if let Node::Element(e) = child {
				promote_heading_duplicates(e);
			}
		}
	}
	let mut root_box = root;
	promote_heading_duplicates(&mut root_box);
	root_box
}