node_html_parser/dom/element/
main.rs

1use super::content::parse_fragment;
2use crate::dom::{node::Node, text::TextNode};
3use regex::Regex;
4use std::collections::HashMap;
5use std::fmt;
6
7#[derive(Debug, Clone)]
8pub struct HTMLElement {
9	pub(super) tag_name: Option<String>, // None for root container
10	pub(crate) raw_attrs: String, // original attribute string (escaped quoting preserved as much as possible)
11	pub attrs: Vec<(String, String)>, // lower-case key, decoded value
12	pub children: Vec<Node>,
13	pub(crate) parent: Option<*mut HTMLElement>,
14	// Whether this element is a void element (no closing tag) according to options at parse time
15	pub(super) is_void: bool,
16	// Whether serializer should append a closing slash (<br/>)
17	pub(super) void_add_slash: bool,
18	// caches for JS-style attribute APIs
19	pub(super) cache_raw_map: Option<HashMap<String, String>>, // original key -> raw (un-decoded) value or empty
20	pub(super) cache_lower_decoded: Option<HashMap<String, String>>, // lowercase key -> decoded
21	pub id: String,
22	pub(super) class_cache: Option<Vec<String>>, // lazily parsed class tokens
23	pub(super) range: Option<(usize, usize)>,    // (start,end)
24	// 是否已完整解析所有 attrs(延迟解析机制预留,当前解析器初始阶段可只解析部分如 id/class)
25	pub(crate) attrs_complete: bool,
26	pub(crate) parse_comment: bool,
27	pub(crate) parse_lowercase: bool,
28}
29
30impl HTMLElement {
31	pub fn new(
32		tag: Option<String>,
33		raw_attrs: String,
34		attrs: Vec<(String, String)>,
35		is_void: bool,
36		void_add_slash: bool,
37	) -> Self {
38		// derive id from provided attrs vector if present (parity with JS ctor behavior #112)
39		let mut id_val = String::new();
40		for (k, v) in &attrs {
41			if k.eq_ignore_ascii_case("id") {
42				id_val = v.clone();
43				break;
44			}
45		}
46		Self {
47			tag_name: tag,
48			raw_attrs,
49			attrs,
50			children: Vec::new(),
51			parent: None,
52			is_void,
53			void_add_slash,
54			cache_raw_map: None,
55			cache_lower_decoded: None,
56			id: id_val,
57			class_cache: None,
58			range: None, // will set to Some((-1,-1)) for non-root below
59			attrs_complete: false,
60			parse_comment: false,
61			parse_lowercase: false,
62		}
63	}
	// adopt_child: never explicitly exposed by the original JS API; it was planned for internal use and was removed once its logic was inlined.
65	pub fn is_root(&self) -> bool {
66		self.tag_name.is_none()
67	}
68	pub fn name(&self) -> &str {
69		self.tag_name.as_deref().unwrap_or("")
70	}
71	/// JS HTMLElement.tagName setter 行为:赋值后序列化使用小写(JS 内部存 rawTagName 小写,tagName getter 返回大写)。
72	/// 为贴近 JS,我们内部沿用小写存储,外部序列化 already 调用 self.name()(即原样)。
73	pub fn set_tag_name(&mut self, new_name: &str) {
74		let lowered = new_name.to_lowercase();
75		self.tag_name = Some(lowered);
76	}
77
78	// classList like helpers
79	pub fn raw_text(&self) -> String {
80		// JS 行为:如果是 <br> 则 rawText 为 "\n"
81		if !self.is_root() && self.name().eq_ignore_ascii_case("br") {
82			return "\n".to_string();
83		}
84		let mut buf = String::new();
85		for c in &self.children {
86			buf.push_str(&c.raw_text());
87		}
88		buf
89	}
90
91	pub fn class_names(&self) -> String {
92		self.get_attr("class").unwrap_or("").to_string()
93	}
94	pub fn inner_html(&self) -> String {
95		self.children.iter().map(|c| c.to_html()).collect()
96	}
97	/// 设置 innerHTML:清空旧子节点并以解析后的片段替换
98	pub fn set_inner_html(&mut self, html: &str) {
99		let mut nodes = parse_fragment(html);
100		if nodes.is_empty() {
101			// JS: 若解析后没有子节点,则使用一个 TextNode(content) 占位
102			nodes.push(Node::Text(TextNode::new(html.to_string())));
103		}
104		self.children.clear();
105		let self_ptr: *mut HTMLElement = self as *mut HTMLElement;
106		for n in nodes.iter_mut() {
107			if let Node::Element(e) = n {
108				e.parent = Some(self_ptr);
109			}
110		}
111		self.children.extend(nodes);
112	}
113
114	// ---- Selector match & closest (模拟 JS HTMLElement.closest) ----
115	/// 判断当前元素是否匹配 selector(使用全局选择再比对引用,性能次优)。
116	pub fn matches_selector<'a>(&'a self, root: &'a HTMLElement, selector: &str) -> bool {
117		// 利用已有 query_selector_all 从 root 选出全部匹配,再比较指针(与 JS Element.matches 行为等价)。
118		let matches = root.query_selector_all(selector);
119		let self_ptr = self as *const HTMLElement;
120		matches.iter().any(|e| *e as *const HTMLElement == self_ptr)
121	}
122	/// JS Element.matches(selector)
123	pub fn matches(&self, selector: &str) -> bool {
124		let root = self.root();
125		self.matches_selector(root, selector)
126	}
127	/// 获取当前树根元素(最外层容器)
128	pub fn root(&self) -> &HTMLElement {
129		let mut cur: &HTMLElement = self;
130		while let Some(p) = cur.parent() {
131			cur = p;
132		}
133		cur
134	}
135	/// JS closest(selector)
136	pub fn closest(&self, selector: &str) -> Option<&HTMLElement> {
137		let mut cur: Option<&HTMLElement> = Some(self);
138		while let Some(c) = cur {
139			if c.matches(selector) {
140				return Some(c);
141			}
142			cur = c.parent();
143		}
144		None
145	}
146	/// JS clone()
147	pub fn clone(&self) -> HTMLElement {
148		self.clone_node()
149	}
150
151	pub fn iter_elements<'a>(&'a self) -> impl Iterator<Item = &'a HTMLElement> + 'a {
152		self.children.iter().filter_map(|n| n.as_element())
153	}
154	pub fn query_selector_all<'a>(&'a self, selector: &str) -> Vec<&'a HTMLElement> {
155		crate::css_select::select_all(selector, self)
156	}
157	pub fn query_selector<'a>(&'a self, selector: &str) -> Option<&'a HTMLElement> {
158		self.query_selector_all(selector).into_iter().next()
159	}
160
161	pub fn remove_whitespace(&mut self) {
162		// 确保在删除文本节点前先完成全部属性解析,避免后续 rebuild_raw_attrs 丢失尚未延迟解析的属性(issue 274)
163		self.ensure_all_attrs();
164		let mut out = Vec::with_capacity(self.children.len());
165		for mut child in self.children.drain(..) {
166			match &mut child {
167				Node::Text(t) => {
168					let mut t2 = t.clone();
169					if !t2.is_whitespace() {
170						let new_raw = {
171							let _ = t2.trimmed_raw_text();
172							t2.trimmed_raw_text().to_string()
173						};
174						t2.set_raw(new_raw);
175						out.push(Node::Text(t2));
176					}
177				}
178				Node::Element(e) => {
179					let mut ec = e.clone();
180					ec.remove_whitespace();
181					out.push(Node::Element(ec));
182				}
183				Node::Comment(_) => {}
184			}
185		}
186		self.children = out;
187		self.rebuild_raw_attrs();
188	}
189
190	/// 模拟 JS HTMLElement.trimRight(pattern): 从右侧开始找到第一个匹配 TextNode 截断后续节点。
191	pub fn trim_right(&mut self, pattern: &Regex) {
192		let mut i = 0usize;
193		while i < self.children.len() {
194			match &mut self.children[i] {
195				Node::Element(e) => {
196					let mut ec = e.clone();
197					ec.trim_right(pattern);
198					self.children[i] = Node::Element(ec);
199				}
200				Node::Text(t) => {
201					if let Some(mat) = pattern.find(&t.raw) {
202						let new_raw = t.raw[..mat.start()].to_string();
203						let mut nt = t.clone();
204						nt.set_raw(new_raw);
205						self.children[i] = Node::Text(nt);
206						self.children.truncate(i + 1); // 截断后续
207						return;
208					}
209				}
210				Node::Comment(_) => {}
211			}
212			i += 1;
213		}
214	}
215
216	/// 输出结构字符串(对应 JS structure 属性)。
217	pub fn structure(&self) -> String {
218		let mut res = Vec::new();
219		fn dfs(cur: &HTMLElement, indent: usize, out: &mut Vec<String>) {
220			if cur.is_root() {
221				for child in &cur.children {
222					if let Node::Element(e) = child {
223						dfs(e, 0, out);
224					}
225				}
226				return;
227			}
228			let mut line = String::new();
229			line.push_str(&"  ".repeat(indent));
230			line.push_str(cur.name());
231			if !cur.id.is_empty() {
232				line.push('#');
233				line.push_str(&cur.id);
234			}
235			if let Some(cls) = cur.get_attr("class") {
236				if !cls.is_empty() {
237					// 去重,保持出现顺序
238					let mut seen = std::collections::HashSet::new();
239					for c in cls.split_whitespace() {
240						if seen.insert(c) {
241							line.push('.');
242							line.push_str(c);
243						}
244					}
245				}
246			}
247			out.push(line);
248			for child in &cur.children {
249				match child {
250					Node::Element(e) => dfs(e, indent + 1, out),
251					Node::Text(t) => {
252						if !t.is_whitespace() {
253							out.push(format!("{}#text", "  ".repeat(indent + 1)));
254						}
255					}
256					Node::Comment(_) => {}
257				}
258			}
259		}
260		dfs(self, 0, &mut res);
261		res.join("\n")
262	}
263	pub fn get_elements_by_tag_name<'a>(&'a self, tag: &str) -> Vec<&'a HTMLElement> {
264		let tgt = tag.to_lowercase();
265		let mut acc = Vec::new();
266		fn walk<'b>(cur: &'b HTMLElement, tgt: &str, acc: &mut Vec<&'b HTMLElement>) {
267			for c in &cur.children {
268				if let Node::Element(e) = c {
269					let inner = &**e;
270					if tgt == "*" || inner.name().eq_ignore_ascii_case(tgt) {
271						acc.push(inner);
272					}
273					walk(inner, tgt, acc);
274				}
275			}
276		}
277		walk(self, &tgt, &mut acc);
278		acc
279	}
280	pub fn get_element_by_id<'a>(&'a self, id: &str) -> Option<&'a HTMLElement> {
281		fn walk<'b>(cur: &'b HTMLElement, id: &str) -> Option<&'b HTMLElement> {
282			for c in &cur.children {
283				if let Node::Element(e) = c {
284					let inner = &**e;
285					if inner.get_attr("id") == Some(id) {
286						return Some(inner);
287					}
288					if let Some(f) = walk(inner, id) {
289						return Some(f);
290					}
291				}
292			}
293			None
294		}
295		walk(self, id)
296	}
297	pub fn get_element_by_id_mut<'a>(&'a mut self, id: &str) -> Option<&'a mut HTMLElement> {
298		fn walk<'b>(cur: &'b mut HTMLElement, id: &str) -> Option<&'b mut HTMLElement> {
299			for c in cur.children.iter_mut() {
300				if let Node::Element(e) = c {
301					// 优先使用缓存的 id 字段避免触发属性延迟解析
302					if e.id == id || e.get_attr("id") == Some(id) {
303						return Some(e);
304					}
305					if let Some(found) = walk(e, id) {
306						return Some(found);
307					}
308				}
309			}
310			None
311		}
312		walk(self, id)
313	}
314	pub fn clone_node(&self) -> HTMLElement {
315		fn clone_rec(el: &HTMLElement) -> Box<HTMLElement> {
316			let mut new = Box::new(HTMLElement {
317				tag_name: el.tag_name.clone(),
318				raw_attrs: el.raw_attrs.clone(),
319				attrs: el.attrs.clone(),
320				children: Vec::new(),
321				parent: None,
322				is_void: el.is_void,
323				void_add_slash: el.void_add_slash,
324				cache_raw_map: None,
325				cache_lower_decoded: None,
326				id: el.id.clone(),
327				class_cache: el.class_cache.clone(),
328				range: None,
329				attrs_complete: el.attrs_complete,
330				parse_comment: el.parse_comment,
331				parse_lowercase: el.parse_lowercase,
332			});
333			for c in &el.children {
334				match c {
335					Node::Element(e) => new.children.push(Node::Element(clone_rec(e))),
336					Node::Text(t) => new.children.push(Node::Text(t.clone())),
337					Node::Comment(cm) => new.children.push(Node::Comment(cm.clone())),
338				};
339			}
340			new
341		}
342		*clone_rec(self)
343	}
344	/// 浅拷贝(不包含子节点)
345	pub fn clone_shallow(&self) -> HTMLElement {
346		HTMLElement {
347			tag_name: self.tag_name.clone(),
348			raw_attrs: self.raw_attrs.clone(),
349			attrs: self.attrs.clone(),
350			children: Vec::new(),
351			parent: None,
352			is_void: self.is_void,
353			void_add_slash: self.void_add_slash,
354			cache_raw_map: None,
355			cache_lower_decoded: None,
356			id: self.id.clone(),
357			class_cache: self.class_cache.clone(),
358			range: None,
359			attrs_complete: self.attrs_complete,
360			parse_comment: self.parse_comment,
361			parse_lowercase: self.parse_lowercase,
362		}
363	}
364	pub fn set_range_start(&mut self, start: usize) {
365		match self.range {
366			Some((_, e)) => self.range = Some((start, e)),
367			None => self.range = Some((start, start)),
368		}
369	}
370	pub fn set_range_end(&mut self, end: usize) {
371		match self.range {
372			Some((s, _)) => self.range = Some((s, end)),
373			None => self.range = Some((end, end)),
374		}
375	}
376	pub fn range(&self) -> Option<(usize, usize)> {
377		self.range
378	}
379}
380
381impl fmt::Display for HTMLElement {
382	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
383		write!(f, "{}", self.outer_html())
384	}
385}