Skip to main content

node_html_parser/dom/element/
attributes.rs

1use super::main::HTMLElement;
2use regex::Regex;
3use std::collections::HashMap;
4use std::sync::OnceLock;
5
// Cached regex used to parse attribute strings.
7static ATTR_PARSE_REGEX: OnceLock<Regex> = OnceLock::new();
// Cached regexes used by quote_attr, so they are not recompiled on every call.
9static QUOTE_ESCAPED_CTRL_REGEX: OnceLock<Regex> = OnceLock::new();
10static QUOTE_STRIP_BS_REGEX: OnceLock<Regex> = OnceLock::new();
11
12impl HTMLElement {
13	pub fn attrs_lower_decoded(&mut self) -> HashMap<String, String> {
14		self.ensure_lower_decoded();
15		self.cache_lower_decoded.clone().unwrap_or_default()
16	}
17
18	pub fn set_attributes(&mut self, attributes: &[(String, String)]) {
19		// 重建 raw_attrs 与 attrs(attrs 的 key 需小写且解码,这里假设传入 value 已为未转义文本,与 JS 行为接近)
20		self.attrs = attributes
21			.iter()
22			.map(|(k, v)| (k.to_lowercase(), v.clone()))
23			.collect();
24		self.raw_attrs = attributes
25			.iter()
26			.map(|(k, v)| {
27				// JS setAttributes: treats raw value 'null' OR '""' OR empty as boolean attribute (only name)
28				if v.is_empty() || v == "null" || v == "\"\"" {
29					k.clone()
30				} else {
31					format!("{}={}", k, quote_attribute(v))
32				}
33			})
34			.collect::<Vec<_>>()
35			.join(" ");
36		self.cache_raw_map = None;
37		self.cache_lower_decoded = None;
38		// 更新 id / class cache
39		if let Some((_, idv)) = self.attrs.iter().find(|(kk, _)| kk == "id") {
40			self.id = idv.clone();
41		}
42		if self.attrs.iter().any(|(kk, _)| kk == "class") {
43			self.class_cache = None;
44		}
45	}
46	pub fn remove_attribute(&mut self, key: &str) {
47		self.build_raw_cache();
48		let mut raw_map = self.cache_raw_map.take().unwrap_or_default();
49		let target = key.to_lowercase();
50		raw_map.retain(|k, _| k.to_lowercase() != target);
51		// sync structured attrs vector
52		self.attrs.retain(|(kk, _)| kk != &target);
53		self.raw_attrs = raw_map
54			.iter()
55			.map(|(k, v)| {
56				if v.is_empty() {
57					k.clone()
58				} else {
59					format!("{}={}", k, quote_attribute(v))
60				}
61			})
62			.collect::<Vec<_>>()
63			.join(" ");
64		self.cache_raw_map = None;
65		self.cache_lower_decoded = None;
66		if target == "id" {
67			self.id.clear();
68		}
69		if target == "class" {
70			self.class_cache = None;
71		}
72		self.attrs_complete = true; // attrs now reflect full set
73		self.attrs_modified = true; // Mark attributes as modified
74	}
75
76	pub fn get_attr(&self, key: &str) -> Option<&str> {
77		// First try already parsed attributes
78		let k = key.to_lowercase();
79		if let Some(found) = self.attrs.iter().find(|(kk, _)| *kk == k) {
80			return Some(found.1.as_str());
81		}
82
83		// If not found and attrs not complete, we need to ensure parsing
84		if !self.attrs_complete && !self.raw_attrs.is_empty() {
85			// Use unsafe to trigger ensure_all_attrs on self
86			let mut_ptr = self as *const HTMLElement as *mut HTMLElement;
87			unsafe {
88				(*mut_ptr).ensure_all_attrs();
89				// Now search again in the updated attrs
90				return (*mut_ptr)
91					.attrs
92					.iter()
93					.find(|(kk, _)| *kk == k)
94					.map(|(_, v)| v.as_str());
95			}
96		}
97
98		None
99	}
100	pub fn has_attr(&self, key: &str) -> bool {
101		self.get_attr(key).is_some()
102	}
103
104	pub fn set_attr(&mut self, key: &str, val: &str) {
105		let k = key.to_lowercase();
106		if let Some(kv) = self.attrs.iter_mut().find(|(kk, _)| *kk == k) {
107			kv.1 = val.to_string();
108		} else {
109			self.attrs.push((k, val.to_string()));
110		}
111		self.rebuild_raw_attrs();
112		self.cache_raw_map = None;
113		self.cache_lower_decoded = None;
114		if key.eq_ignore_ascii_case("id") {
115			self.id = val.to_string();
116		}
117	}
118	pub fn remove_attr(&mut self, key: &str) {
119		let k = key.to_lowercase();
120		self.attrs.retain(|(kk, _)| *kk != k);
121		self.rebuild_raw_attrs();
122		self.cache_raw_map = None;
123		self.cache_lower_decoded = None;
124		if k == "id" {
125			self.id.clear();
126		}
127	}
128	/// Convenience: remove the id attribute (safe wrapper for tests parity with JS removeAttribute('id'))
129	pub fn remove_id(&mut self) {
130		self.remove_attribute("id");
131	}
132	/// Convenience: set id attribute (safe wrapper to avoid direct raw mutation in tests)
133	pub fn set_id(&mut self, id: &str) {
134		self.set_attribute("id", id);
135	}
136	pub(super) fn rebuild_raw_attrs(&mut self) {
137		// 保持原有顺序,使用与 JS Quote 逻辑更接近的方式(参见 nodes/html.ts quoteAttribute)
138		fn quote_attr(src: &str) -> String {
139			if src.is_empty() || src == "null" {
140				return src.to_string();
141			}
142			// 先替换双引号
143			let replaced = src.replace('"', "&quot;");
144			// 模拟 JS: JSON.stringify 然后还原制表/换行/回车并移除反斜杠
145			let jsoned =
146				serde_json::to_string(&replaced).unwrap_or_else(|_| format!("\"{}\"", replaced));
147			// jsoned 形如 "...",去掉外层引号后处理内部转义
148			let inner = jsoned.trim_matches('"');
149			let re_ctrl = QUOTE_ESCAPED_CTRL_REGEX
150				.get_or_init(|| Regex::new(r"([^\\])\\([tnr])").unwrap());
151			let re_bs =
152				QUOTE_STRIP_BS_REGEX.get_or_init(|| Regex::new(r"([^\\])\\").unwrap());
153			let inner = re_ctrl
154				.replace_all(inner, |caps: &regex::Captures| match &caps[2] {
155					"t" => format!("{}\t", &caps[1]),
156					"n" => format!("{}\n", &caps[1]),
157					"r" => format!("{}\r", &caps[1]),
158					_ => caps[0].to_string(),
159				})
160				.to_string();
161			let inner = re_bs.replace_all(&inner, "$1").to_string();
162			format!("\"{}\"", inner)
163		}
164		self.raw_attrs = self
165			.attrs
166			.iter()
167			.map(|(k, v)| {
168				if v.is_empty() {
169					k.clone()
170				} else {
171					format!("{}={}", k, quote_attr(v))
172				}
173			})
174			.collect::<Vec<_>>()
175			.join(" ");
176	}
177
178	pub fn attributes(&mut self) -> std::collections::HashMap<String, String> {
179		// JS: Element.attributes preserves original attribute name casing/order (first occurrence) while returning decoded values.
180		// We approximate with a HashMap (order not guaranteed) but keep original key casing from raw parsing.
181		self.build_raw_cache();
182		let mut out = std::collections::HashMap::new();
183		if let Some(raw) = &self.cache_raw_map {
184			for (orig_k, raw_v) in raw.iter() {
185				let decoded = html_escape::decode_html_entities(raw_v).to_string();
186				// Insert only if absent (first occurrence wins) – raw_map already keeps first, so direct insert.
187				out.insert(orig_k.clone(), decoded);
188			}
189		}
190		out
191	}
192	pub fn raw_attributes(&mut self) -> HashMap<String, String> {
193		self.build_raw_cache();
194		self.cache_raw_map.clone().unwrap_or_default()
195	}
196	/// Read-only snapshot of the original raw attribute string (public accessor for tests like issue 136)
197	pub fn raw_attrs_str(&self) -> &str {
198		&self.raw_attrs
199	}
200
201	pub fn get_attribute(&mut self, key: &str) -> Option<String> {
202		self.ensure_lower_decoded();
203		self.cache_lower_decoded
204			.as_ref()
205			.unwrap()
206			.get(&key.to_lowercase())
207			.cloned()
208	}
209
210	pub fn set_attribute(&mut self, key: &str, value: &str) {
211		// Update raw_attrs string representation, preserving original attribute order
212		let quoted_value = if value.is_empty() {
213			None
214		} else {
215			Some(quote_attribute(value))
216		};
217
218		if self.raw_attrs.is_empty() {
219			if let Some(qv) = quoted_value {
220				self.raw_attrs = format!("{}={}", key, qv);
221			} else {
222				self.raw_attrs = key.to_string();
223			}
224		} else {
225			// Parse existing attributes to preserve order
226			let re = ATTR_PARSE_REGEX.get_or_init(|| {
227				regex::Regex::new(
228					r#"([a-zA-Z()\[\]#@$.?:][a-zA-Z0-9-._:()\[\]#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+))?"#,
229				)
230				.unwrap()
231			});
232
233			let mut result_attrs = Vec::new();
234			let mut found = false;
235
236			for cap in re.captures_iter(&self.raw_attrs) {
237				let existing_key = cap.get(1).unwrap().as_str();
238				if existing_key.eq_ignore_ascii_case(key) {
239					// Replace this attribute, preserve original case
240					if let Some(qv) = &quoted_value {
241						result_attrs.push(format!("{}={}", existing_key, qv));
242					} else {
243						result_attrs.push(existing_key.to_string());
244					}
245					found = true;
246				} else {
247					// Keep existing attribute as-is
248					let existing_val = cap.get(2).map(|m| m.as_str()).unwrap_or("");
249					if existing_val.is_empty() {
250						result_attrs.push(existing_key.to_string());
251					} else {
252						result_attrs.push(format!("{}={}", existing_key, existing_val));
253					}
254				}
255			}
256
257			// If not found, add at the end
258			if !found {
259				if let Some(qv) = quoted_value {
260					result_attrs.push(format!("{}={}", key, qv));
261				} else {
262					result_attrs.push(key.to_string());
263				}
264			}
265
266			self.raw_attrs = result_attrs.join(" ");
267		}
268
269		// Update structured attrs with decoded value
270		self.ensure_all_attrs();
271		let lk = key.to_lowercase();
272		let decoded_val = html_escape::decode_html_entities(value).to_string();
273		if let Some(kv) = self.attrs.iter_mut().find(|(k, _)| *k == lk) {
274			kv.1 = decoded_val;
275		} else {
276			self.attrs.push((lk, decoded_val));
277		}
278
279		// Clear caches to force rebuild
280		self.cache_raw_map = None;
281		self.cache_lower_decoded = None;
282		self.attrs_complete = true;
283		self.attrs_modified = true; // Mark attributes as modified
284
285		// Update element-specific caches
286		if key.eq_ignore_ascii_case("id") {
287			self.id = value.to_string();
288		}
289		if key.eq_ignore_ascii_case("class") {
290			self.class_cache = None;
291		}
292	}
293
294	pub fn has_attribute(&mut self, key: &str) -> bool {
295		self.ensure_lower_decoded();
296		self.cache_lower_decoded
297			.as_ref()
298			.unwrap()
299			.contains_key(&key.to_lowercase())
300	}
301
302	pub(crate) fn ensure_all_attrs(&mut self) {
303		if self.attrs_complete {
304			return;
305		}
306
307		// Clear existing attrs and rebuild from raw_attrs string
308		self.attrs.clear();
309		self.build_raw_cache();
310		if let Some(ref raw_map) = self.cache_raw_map {
311			for (key, value) in raw_map.iter() {
312				let decoded_val = html_escape::decode_html_entities(value).to_string();
313				self.attrs.push((key.to_lowercase(), decoded_val));
314			}
315		}
316
317		self.attrs_complete = true;
318	}
319	fn build_raw_cache(&mut self) {
320		if self.cache_raw_map.is_some() {
321			return;
322		}
323
324		let mut map = HashMap::new();
325		if !self.raw_attrs.is_empty() {
326			let re = ATTR_PARSE_REGEX.get_or_init(|| {
327				regex::Regex::new(
328					r#"([a-zA-Z()\[\]#@$.?:][a-zA-Z0-9-._:()\[\]#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+))?"#,
329				)
330				.unwrap()
331			});
332			for cap in re.captures_iter(&self.raw_attrs) {
333				let key = cap.get(1).unwrap().as_str();
334				let mut val = cap.get(2).map(|m| m.as_str()).unwrap_or("").to_string();
335				if !val.is_empty() {
336					if (val.starts_with('\"') && val.ends_with('\"'))
337						|| (val.starts_with('\'') && val.ends_with('\''))
338					{
339						val = val[1..val.len() - 1].to_string();
340					}
341				}
342				// only first occurrence kept (JS behavior)
343				map.entry(key.to_string()).or_insert(val);
344			}
345		}
346		self.cache_raw_map = Some(map);
347	}
348
349	fn ensure_lower_decoded(&mut self) {
350		if self.cache_lower_decoded.is_some() {
351			return;
352		}
353
354		self.build_raw_cache();
355		let mut lower_decoded = HashMap::new();
356
357		if let Some(ref raw_map) = self.cache_raw_map {
358			for (key, value) in raw_map.iter() {
359				let decoded_val = html_escape::decode_html_entities(value).to_string();
360				let lower_key = key.to_lowercase();
361				lower_decoded.insert(lower_key, decoded_val);
362			}
363		}
364
365		self.cache_lower_decoded = Some(lower_decoded);
366	}
367}
368
369fn quote_attribute(val: &str) -> String {
370	if val.is_empty() {
371		return val.to_string();
372	}
373	let replaced = val.replace('"', "&quot;");
374	let jsoned = serde_json::to_string(&replaced).unwrap_or_else(|_| format!("\"{}\"", replaced));
375	let inner = jsoned.trim_matches('"');
376	let inner = inner
377		.replace("\\t", "\t")
378		.replace("\\n", "\n")
379		.replace("\\r", "\r")
380		.replace('\\', "");
381	format!("\"{}\"", inner)
382}