Skip to main content

node_html_parser/dom/element/
main.rs

1use super::content::parse_fragment;
2use crate::dom::{node::Node, text::TextNode};
3use regex::Regex;
4use std::collections::HashMap;
5use std::fmt;
6
7#[cfg(feature = "parallel")]
8use rayon::prelude::*;
9
10#[derive(Debug, Clone)]
11pub struct HTMLElement {
12	pub(super) tag_name: Option<String>, // None for root container
13	pub(crate) raw_attrs: String, // original attribute string (escaped quoting preserved as much as possible)
14	pub attrs: Vec<(String, String)>, // lower-case key, decoded value
15	pub children: Vec<Node>,
16	pub(crate) parent: Option<*mut HTMLElement>,
17	// Whether this element is a void element (no closing tag) according to options at parse time
18	pub(super) is_void: bool,
19	// Whether serializer should append a closing slash (<br/>)
20	pub(super) void_add_slash: bool,
21	// caches for JS-style attribute APIs
22	pub(super) cache_raw_map: Option<HashMap<String, String>>, // original key -> raw (un-decoded) value or empty
23	pub(super) cache_lower_decoded: Option<HashMap<String, String>>, // lowercase key -> decoded
24	pub id: String,
25	pub(super) class_cache: Option<Vec<String>>, // lazily parsed class tokens
26	pub(super) range: Option<(usize, usize)>,    // (start,end)
27	// 是否已完整解析所有 attrs(延迟解析机制预留,当前解析器初始阶段可只解析部分如 id/class)
28	pub(crate) attrs_complete: bool,
29	// 是否属性已被修改(用于决定序列化时是否需要标准化引号)
30	pub(crate) attrs_modified: bool,
31	pub(crate) parse_comment: bool,
32	pub(crate) parse_lowercase: bool,
33}
34
35impl HTMLElement {
36	pub fn new(
37		tag: Option<String>,
38		raw_attrs: String,
39		attrs: Vec<(String, String)>,
40		is_void: bool,
41		void_add_slash: bool,
42	) -> Self {
43		// derive id from provided attrs vector if present (parity with JS ctor behavior #112)
44		let mut id_val = String::new();
45		for (k, v) in &attrs {
46			if k.eq_ignore_ascii_case("id") {
47				id_val = v.clone();
48				break;
49			}
50		}
51		Self {
52			tag_name: tag,
53			raw_attrs,
54			attrs,
55			// 🚀 优化:预分配children容量,减少重新分配
56			children: Vec::with_capacity(2),
57			parent: None,
58			is_void,
59			void_add_slash,
60			cache_raw_map: None,
61			cache_lower_decoded: None,
62
63			id: id_val,
64			class_cache: None,
65			range: None, // will set to Some((-1,-1)) for non-root below
66			attrs_complete: false,
67			attrs_modified: false,
68			parse_comment: false,
69			parse_lowercase: false,
70		}
71	}
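	// adopt_child: never explicitly exposed by the original JS API. It was once planned for internal use, but that logic has since been inlined and the helper removed.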
73	pub fn is_root(&self) -> bool {
74		self.tag_name.is_none()
75	}
76	pub fn name(&self) -> &str {
77		self.tag_name.as_deref().unwrap_or("")
78	}
79	/// JS HTMLElement.tagName setter 行为:赋值后序列化使用小写(JS 内部存 rawTagName 小写,tagName getter 返回大写)。
80	/// 为贴近 JS,我们内部沿用小写存储,外部序列化 already 调用 self.name()(即原样)。
81	pub fn set_tag_name(&mut self, new_name: &str) {
82		let lowered = new_name.to_lowercase();
83		self.tag_name = Some(lowered);
84	}
85
86	// classList like helpers
87	pub fn raw_text(&self) -> String {
88		// JS 行为:如果是 <br> 则 rawText 为 "\n"
89		if !self.is_root() && self.name().eq_ignore_ascii_case("br") {
90			return "\n".to_string();
91		}
92		let mut buf = String::new();
93		for c in &self.children {
94			buf.push_str(&c.raw_text());
95		}
96		buf
97	}
98
99	pub fn class_names(&self) -> String {
100		self.get_attr("class").unwrap_or("").to_string()
101	}
102	pub fn inner_html(&self) -> String {
103		self.children.iter().map(|c| c.to_html()).collect()
104	}
105	/// 设置 innerHTML:清空旧子节点并以解析后的片段替换
106	pub fn set_inner_html(&mut self, html: &str) {
107		let mut nodes = parse_fragment(html);
108		if nodes.is_empty() {
109			// JS: 若解析后没有子节点,则使用一个 TextNode(content) 占位
110			nodes.push(Node::Text(TextNode::new(html.to_string())));
111		}
112		self.children.clear();
113		let self_ptr: *mut HTMLElement = self as *mut HTMLElement;
114		for n in nodes.iter_mut() {
115			if let Node::Element(e) = n {
116				e.parent = Some(self_ptr);
117			}
118		}
119		self.children.extend(nodes);
120	}
121
	// ---- Selector match & closest (mirrors JS HTMLElement.closest) ----
123	/// 判断当前元素是否匹配 selector(使用全局选择再比对引用,性能次优)。
124	pub fn matches_selector<'a>(&'a self, root: &'a HTMLElement, selector: &str) -> bool {
125		// 利用已有 query_selector_all 从 root 选出全部匹配,再比较指针(与 JS Element.matches 行为等价)。
126		let matches = root.query_selector_all(selector);
127		let self_ptr = self as *const HTMLElement;
128		matches.iter().any(|e| *e as *const HTMLElement == self_ptr)
129	}
130	/// JS Element.matches(selector)
131	pub fn matches(&self, selector: &str) -> bool {
132		let root = self.root();
133		self.matches_selector(root, selector)
134	}
135	/// 获取当前树根元素(最外层容器)
136	pub fn root(&self) -> &HTMLElement {
137		let mut cur: &HTMLElement = self;
138		while let Some(p) = cur.parent() {
139			cur = p;
140		}
141		cur
142	}
143	/// JS closest(selector)
144	pub fn closest(&self, selector: &str) -> Option<&HTMLElement> {
145		let mut cur: Option<&HTMLElement> = Some(self);
146		while let Some(c) = cur {
147			if c.matches(selector) {
148				return Some(c);
149			}
150			cur = c.parent();
151		}
152		None
153	}
154	/// JS clone()
155	pub fn clone(&self) -> HTMLElement {
156		self.clone_node()
157	}
158
159	pub fn iter_elements<'a>(&'a self) -> impl Iterator<Item = &'a HTMLElement> + 'a {
160		self.children.iter().filter_map(|n| n.as_element())
161	}
162	pub fn query_selector_all<'a>(&'a self, selector: &str) -> Vec<&'a HTMLElement> {
163		crate::css_select::select_all(selector, self)
164	}
165	pub fn query_selector<'a>(&'a self, selector: &str) -> Option<&'a HTMLElement> {
166		self.query_selector_all(selector).into_iter().next()
167	}
168
169	pub fn remove_whitespace(&mut self) {
170		// 确保在删除文本节点前先完成全部属性解析,避免后续 rebuild_raw_attrs 丢失尚未延迟解析的属性(issue 274)
171		self.ensure_all_attrs();
172		let mut out = Vec::with_capacity(self.children.len());
173		for mut child in self.children.drain(..) {
174			match &mut child {
175				Node::Text(t) => {
176					let mut t2 = t.clone();
177					if !t2.is_whitespace() {
178						let new_raw = {
179							let _ = t2.trimmed_raw_text();
180							t2.trimmed_raw_text().to_string()
181						};
182						t2.set_raw(new_raw);
183						out.push(Node::Text(t2));
184					}
185				}
186				Node::Element(e) => {
187					let mut ec = e.clone();
188					ec.remove_whitespace();
189					out.push(Node::Element(ec));
190				}
191				Node::Comment(_) => {}
192			}
193		}
194		self.children = out;
195		self.rebuild_raw_attrs();
196	}
197
198	/// 模拟 JS HTMLElement.trimRight(pattern): 从右侧开始找到第一个匹配 TextNode 截断后续节点。
199	pub fn trim_right(&mut self, pattern: &Regex) {
200		let mut i = 0usize;
201		while i < self.children.len() {
202			match &mut self.children[i] {
203				Node::Element(e) => {
204					let mut ec = e.clone();
205					ec.trim_right(pattern);
206					self.children[i] = Node::Element(ec);
207				}
208				Node::Text(t) => {
209					if let Some(mat) = pattern.find(&t.raw) {
210						let new_raw = t.raw[..mat.start()].to_string();
211						let mut nt = t.clone();
212						nt.set_raw(new_raw);
213						self.children[i] = Node::Text(nt);
214						self.children.truncate(i + 1); // 截断后续
215						return;
216					}
217				}
218				Node::Comment(_) => {}
219			}
220			i += 1;
221		}
222	}
223
224	/// 输出结构字符串(对应 JS structure 属性)。
225	pub fn structure(&self) -> String {
226		let mut res = Vec::new();
227		fn dfs(cur: &HTMLElement, indent: usize, out: &mut Vec<String>) {
228			if cur.is_root() {
229				for child in &cur.children {
230					if let Node::Element(e) = child {
231						dfs(e, 0, out);
232					}
233				}
234				return;
235			}
236			let mut line = String::new();
237			line.push_str(&"  ".repeat(indent));
238			line.push_str(cur.name());
239			if !cur.id.is_empty() {
240				line.push('#');
241				line.push_str(&cur.id);
242			}
243			if let Some(cls) = cur.get_attr("class") {
244				if !cls.is_empty() {
245					// 去重,保持出现顺序
246					let mut seen = std::collections::HashSet::new();
247					for c in cls.split_whitespace() {
248						if seen.insert(c) {
249							line.push('.');
250							line.push_str(c);
251						}
252					}
253				}
254			}
255			out.push(line);
256			for child in &cur.children {
257				match child {
258					Node::Element(e) => dfs(e, indent + 1, out),
259					Node::Text(t) => {
260						if !t.is_whitespace() {
261							out.push(format!("{}#text", "  ".repeat(indent + 1)));
262						}
263					}
264					Node::Comment(_) => {}
265				}
266			}
267		}
268		dfs(self, 0, &mut res);
269		res.join("\n")
270	}
271	pub fn get_elements_by_tag_name<'a>(&'a self, tag: &str) -> Vec<&'a HTMLElement> {
272		let tgt = tag.to_lowercase();
273		let mut acc = Vec::new();
274		fn walk<'b>(cur: &'b HTMLElement, tgt: &str, acc: &mut Vec<&'b HTMLElement>) {
275			for c in &cur.children {
276				if let Node::Element(e) = c {
277					let inner = &**e;
278					if tgt == "*" || inner.name().eq_ignore_ascii_case(tgt) {
279						acc.push(inner);
280					}
281					walk(inner, tgt, acc);
282				}
283			}
284		}
285		walk(self, &tgt, &mut acc);
286		acc
287	}
288	pub fn get_element_by_id<'a>(&'a self, id: &str) -> Option<&'a HTMLElement> {
289		fn walk<'b>(cur: &'b HTMLElement, id: &str) -> Option<&'b HTMLElement> {
290			for c in &cur.children {
291				if let Node::Element(e) = c {
292					let inner = &**e;
293					if inner.get_attr("id") == Some(id) {
294						return Some(inner);
295					}
296					if let Some(f) = walk(inner, id) {
297						return Some(f);
298					}
299				}
300			}
301			None
302		}
303		walk(self, id)
304	}
305	pub fn get_element_by_id_mut<'a>(&'a mut self, id: &str) -> Option<&'a mut HTMLElement> {
306		fn walk<'b>(cur: &'b mut HTMLElement, id: &str) -> Option<&'b mut HTMLElement> {
307			for c in cur.children.iter_mut() {
308				if let Node::Element(e) = c {
309					// 优先使用缓存的 id 字段避免触发属性延迟解析
310					if e.id == id || e.get_attr("id") == Some(id) {
311						return Some(e);
312					}
313					if let Some(found) = walk(e, id) {
314						return Some(found);
315					}
316				}
317			}
318			None
319		}
320		walk(self, id)
321	}
322	pub fn clone_node(&self) -> HTMLElement {
323		fn clone_rec(el: &HTMLElement) -> Box<HTMLElement> {
324			let mut new = Box::new(HTMLElement {
325				tag_name: el.tag_name.clone(),
326				raw_attrs: el.raw_attrs.clone(),
327				attrs: el.attrs.clone(),
328				children: Vec::new(),
329				parent: None,
330				is_void: el.is_void,
331				void_add_slash: el.void_add_slash,
332				cache_raw_map: None,
333				cache_lower_decoded: None,
334
335				id: el.id.clone(),
336				class_cache: el.class_cache.clone(),
337				range: None,
338				attrs_complete: el.attrs_complete,
339				attrs_modified: el.attrs_modified,
340				parse_comment: el.parse_comment,
341				parse_lowercase: el.parse_lowercase,
342			});
343			for c in &el.children {
344				match c {
345					Node::Element(e) => new.children.push(Node::Element(clone_rec(e))),
346					Node::Text(t) => new.children.push(Node::Text(t.clone())),
347					Node::Comment(cm) => new.children.push(Node::Comment(cm.clone())),
348				};
349			}
350			new
351		}
352		*clone_rec(self)
353	}
354	/// 浅拷贝(不包含子节点)
355	pub fn clone_shallow(&self) -> HTMLElement {
356		HTMLElement {
357			tag_name: self.tag_name.clone(),
358			raw_attrs: self.raw_attrs.clone(),
359			attrs: self.attrs.clone(),
360			children: Vec::new(),
361			parent: None,
362			is_void: self.is_void,
363			void_add_slash: self.void_add_slash,
364			cache_raw_map: None,
365			cache_lower_decoded: None,
366
367			id: self.id.clone(),
368			class_cache: self.class_cache.clone(),
369			range: None,
370			attrs_complete: self.attrs_complete,
371			attrs_modified: self.attrs_modified,
372			parse_comment: self.parse_comment,
373			parse_lowercase: self.parse_lowercase,
374		}
375	}
376	pub fn set_range_start(&mut self, start: usize) {
377		match self.range {
378			Some((_, e)) => self.range = Some((start, e)),
379			None => self.range = Some((start, start)),
380		}
381	}
382	pub fn set_range_end(&mut self, end: usize) {
383		match self.range {
384			Some((s, _)) => self.range = Some((s, end)),
385			None => self.range = Some((end, end)),
386		}
387	}
388	pub fn range(&self) -> Option<(usize, usize)> {
389		self.range
390	}
391
392	/// 批量处理多个元素的属性解析 (启用parallel特性时使用rayon)
393	/// 注意:由于线程安全约束,暂时使用串行处理
394	#[cfg(feature = "parallel")]
395	pub fn batch_ensure_attributes_safe(elements: &mut [HTMLElement]) {
396		// 暂时使用串行处理以避免线程安全问题
397		for el in elements.iter_mut() {
398			el.ensure_all_attrs();
399		}
400	}
401
402	/// 并行处理文本节点(线程安全版本)
403	#[cfg(feature = "parallel")]
404	pub fn process_text_nodes_parallel(text_nodes: &mut [crate::dom::text::TextNode]) {
405		const PARALLEL_THRESHOLD: usize = 20;
406
407		if text_nodes.len() >= PARALLEL_THRESHOLD {
408			text_nodes.par_iter_mut().for_each(|node| {
409				// 只处理不涉及DOM结构修改的操作
410				let _ = node.is_whitespace();
411				let _ = node.trimmed_raw_text();
412			});
413		}
414	}
415}
416
417impl fmt::Display for HTMLElement {
418	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
419		write!(f, "{}", self.outer_html())
420	}
421}