node_html_parser/dom/element/
attributes.rs1use super::main::HTMLElement;
2use regex::Regex;
3use std::collections::HashMap;
4use std::sync::OnceLock;
5
// Lazily compiled pattern that splits a raw attribute string into a name and
// an optional `=value` part; initialised on first use by the methods below.
6static ATTR_PARSE_REGEX: OnceLock<Regex> = OnceLock::new();
// Lazily compiled pattern used by the value-quoting helper inside
// `rebuild_raw_attrs` to locate escaped tab / newline / carriage-return
// sequences in JSON-escaped text.
8static QUOTE_ESCAPED_CTRL_REGEX: OnceLock<Regex> = OnceLock::new();
// Lazily compiled pattern used by the same helper to strip the backslashes
// that JSON escaping introduced.
10static QUOTE_STRIP_BS_REGEX: OnceLock<Regex> = OnceLock::new();
11
12impl HTMLElement {
13 pub fn attrs_lower_decoded(&mut self) -> HashMap<String, String> {
14 self.ensure_lower_decoded();
15 self.cache_lower_decoded.clone().unwrap_or_default()
16 }
17
18 pub fn set_attributes(&mut self, attributes: &[(String, String)]) {
19 self.attrs = attributes
21 .iter()
22 .map(|(k, v)| (k.to_lowercase(), v.clone()))
23 .collect();
24 self.raw_attrs = attributes
25 .iter()
26 .map(|(k, v)| {
27 if v.is_empty() || v == "null" || v == "\"\"" {
29 k.clone()
30 } else {
31 format!("{}={}", k, quote_attribute(v))
32 }
33 })
34 .collect::<Vec<_>>()
35 .join(" ");
36 self.cache_raw_map = None;
37 self.cache_lower_decoded = None;
38 if let Some((_, idv)) = self.attrs.iter().find(|(kk, _)| kk == "id") {
40 self.id = idv.clone();
41 }
42 if self.attrs.iter().any(|(kk, _)| kk == "class") {
43 self.class_cache = None;
44 }
45 }
46 pub fn remove_attribute(&mut self, key: &str) {
47 self.build_raw_cache();
48 let mut raw_map = self.cache_raw_map.take().unwrap_or_default();
49 let target = key.to_lowercase();
50 raw_map.retain(|k, _| k.to_lowercase() != target);
51 self.attrs.retain(|(kk, _)| kk != &target);
53 self.raw_attrs = raw_map
54 .iter()
55 .map(|(k, v)| {
56 if v.is_empty() {
57 k.clone()
58 } else {
59 format!("{}={}", k, quote_attribute(v))
60 }
61 })
62 .collect::<Vec<_>>()
63 .join(" ");
64 self.cache_raw_map = None;
65 self.cache_lower_decoded = None;
66 if target == "id" {
67 self.id.clear();
68 }
69 if target == "class" {
70 self.class_cache = None;
71 }
72 self.attrs_complete = true; self.attrs_modified = true; }
75
76 pub fn get_attr(&self, key: &str) -> Option<&str> {
77 let k = key.to_lowercase();
79 if let Some(found) = self.attrs.iter().find(|(kk, _)| *kk == k) {
80 return Some(found.1.as_str());
81 }
82
83 if !self.attrs_complete && !self.raw_attrs.is_empty() {
85 let mut_ptr = self as *const HTMLElement as *mut HTMLElement;
87 unsafe {
88 (*mut_ptr).ensure_all_attrs();
89 return (*mut_ptr)
91 .attrs
92 .iter()
93 .find(|(kk, _)| *kk == k)
94 .map(|(_, v)| v.as_str());
95 }
96 }
97
98 None
99 }
100 pub fn has_attr(&self, key: &str) -> bool {
101 self.get_attr(key).is_some()
102 }
103
104 pub fn set_attr(&mut self, key: &str, val: &str) {
105 let k = key.to_lowercase();
106 if let Some(kv) = self.attrs.iter_mut().find(|(kk, _)| *kk == k) {
107 kv.1 = val.to_string();
108 } else {
109 self.attrs.push((k, val.to_string()));
110 }
111 self.rebuild_raw_attrs();
112 self.cache_raw_map = None;
113 self.cache_lower_decoded = None;
114 if key.eq_ignore_ascii_case("id") {
115 self.id = val.to_string();
116 }
117 }
118 pub fn remove_attr(&mut self, key: &str) {
119 let k = key.to_lowercase();
120 self.attrs.retain(|(kk, _)| *kk != k);
121 self.rebuild_raw_attrs();
122 self.cache_raw_map = None;
123 self.cache_lower_decoded = None;
124 if k == "id" {
125 self.id.clear();
126 }
127 }
128 pub fn remove_id(&mut self) {
130 self.remove_attribute("id");
131 }
132 pub fn set_id(&mut self, id: &str) {
134 self.set_attribute("id", id);
135 }
136 pub(super) fn rebuild_raw_attrs(&mut self) {
137 fn quote_attr(src: &str) -> String {
139 if src.is_empty() || src == "null" {
140 return src.to_string();
141 }
142 let replaced = src.replace('"', """);
144 let jsoned =
146 serde_json::to_string(&replaced).unwrap_or_else(|_| format!("\"{}\"", replaced));
147 let inner = jsoned.trim_matches('"');
149 let re_ctrl = QUOTE_ESCAPED_CTRL_REGEX
150 .get_or_init(|| Regex::new(r"([^\\])\\([tnr])").unwrap());
151 let re_bs =
152 QUOTE_STRIP_BS_REGEX.get_or_init(|| Regex::new(r"([^\\])\\").unwrap());
153 let inner = re_ctrl
154 .replace_all(inner, |caps: ®ex::Captures| match &caps[2] {
155 "t" => format!("{}\t", &caps[1]),
156 "n" => format!("{}\n", &caps[1]),
157 "r" => format!("{}\r", &caps[1]),
158 _ => caps[0].to_string(),
159 })
160 .to_string();
161 let inner = re_bs.replace_all(&inner, "$1").to_string();
162 format!("\"{}\"", inner)
163 }
164 self.raw_attrs = self
165 .attrs
166 .iter()
167 .map(|(k, v)| {
168 if v.is_empty() {
169 k.clone()
170 } else {
171 format!("{}={}", k, quote_attr(v))
172 }
173 })
174 .collect::<Vec<_>>()
175 .join(" ");
176 }
177
178 pub fn attributes(&mut self) -> std::collections::HashMap<String, String> {
179 self.build_raw_cache();
182 let mut out = std::collections::HashMap::new();
183 if let Some(raw) = &self.cache_raw_map {
184 for (orig_k, raw_v) in raw.iter() {
185 let decoded = html_escape::decode_html_entities(raw_v).to_string();
186 out.insert(orig_k.clone(), decoded);
188 }
189 }
190 out
191 }
192 pub fn raw_attributes(&mut self) -> HashMap<String, String> {
193 self.build_raw_cache();
194 self.cache_raw_map.clone().unwrap_or_default()
195 }
196 pub fn raw_attrs_str(&self) -> &str {
198 &self.raw_attrs
199 }
200
201 pub fn get_attribute(&mut self, key: &str) -> Option<String> {
202 self.ensure_lower_decoded();
203 self.cache_lower_decoded
204 .as_ref()
205 .unwrap()
206 .get(&key.to_lowercase())
207 .cloned()
208 }
209
210 pub fn set_attribute(&mut self, key: &str, value: &str) {
211 let quoted_value = if value.is_empty() {
213 None
214 } else {
215 Some(quote_attribute(value))
216 };
217
218 if self.raw_attrs.is_empty() {
219 if let Some(qv) = quoted_value {
220 self.raw_attrs = format!("{}={}", key, qv);
221 } else {
222 self.raw_attrs = key.to_string();
223 }
224 } else {
225 let re = ATTR_PARSE_REGEX.get_or_init(|| {
227 regex::Regex::new(
228 r#"([a-zA-Z()\[\]#@$.?:][a-zA-Z0-9-._:()\[\]#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+))?"#,
229 )
230 .unwrap()
231 });
232
233 let mut result_attrs = Vec::new();
234 let mut found = false;
235
236 for cap in re.captures_iter(&self.raw_attrs) {
237 let existing_key = cap.get(1).unwrap().as_str();
238 if existing_key.eq_ignore_ascii_case(key) {
239 if let Some(qv) = "ed_value {
241 result_attrs.push(format!("{}={}", existing_key, qv));
242 } else {
243 result_attrs.push(existing_key.to_string());
244 }
245 found = true;
246 } else {
247 let existing_val = cap.get(2).map(|m| m.as_str()).unwrap_or("");
249 if existing_val.is_empty() {
250 result_attrs.push(existing_key.to_string());
251 } else {
252 result_attrs.push(format!("{}={}", existing_key, existing_val));
253 }
254 }
255 }
256
257 if !found {
259 if let Some(qv) = quoted_value {
260 result_attrs.push(format!("{}={}", key, qv));
261 } else {
262 result_attrs.push(key.to_string());
263 }
264 }
265
266 self.raw_attrs = result_attrs.join(" ");
267 }
268
269 self.ensure_all_attrs();
271 let lk = key.to_lowercase();
272 let decoded_val = html_escape::decode_html_entities(value).to_string();
273 if let Some(kv) = self.attrs.iter_mut().find(|(k, _)| *k == lk) {
274 kv.1 = decoded_val;
275 } else {
276 self.attrs.push((lk, decoded_val));
277 }
278
279 self.cache_raw_map = None;
281 self.cache_lower_decoded = None;
282 self.attrs_complete = true;
283 self.attrs_modified = true; if key.eq_ignore_ascii_case("id") {
287 self.id = value.to_string();
288 }
289 if key.eq_ignore_ascii_case("class") {
290 self.class_cache = None;
291 }
292 }
293
294 pub fn has_attribute(&mut self, key: &str) -> bool {
295 self.ensure_lower_decoded();
296 self.cache_lower_decoded
297 .as_ref()
298 .unwrap()
299 .contains_key(&key.to_lowercase())
300 }
301
302 pub(crate) fn ensure_all_attrs(&mut self) {
303 if self.attrs_complete {
304 return;
305 }
306
307 self.attrs.clear();
309 self.build_raw_cache();
310 if let Some(ref raw_map) = self.cache_raw_map {
311 for (key, value) in raw_map.iter() {
312 let decoded_val = html_escape::decode_html_entities(value).to_string();
313 self.attrs.push((key.to_lowercase(), decoded_val));
314 }
315 }
316
317 self.attrs_complete = true;
318 }
319 fn build_raw_cache(&mut self) {
320 if self.cache_raw_map.is_some() {
321 return;
322 }
323
324 let mut map = HashMap::new();
325 if !self.raw_attrs.is_empty() {
326 let re = ATTR_PARSE_REGEX.get_or_init(|| {
327 regex::Regex::new(
328 r#"([a-zA-Z()\[\]#@$.?:][a-zA-Z0-9-._:()\[\]#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+))?"#,
329 )
330 .unwrap()
331 });
332 for cap in re.captures_iter(&self.raw_attrs) {
333 let key = cap.get(1).unwrap().as_str();
334 let mut val = cap.get(2).map(|m| m.as_str()).unwrap_or("").to_string();
335 if !val.is_empty() {
336 if (val.starts_with('\"') && val.ends_with('\"'))
337 || (val.starts_with('\'') && val.ends_with('\''))
338 {
339 val = val[1..val.len() - 1].to_string();
340 }
341 }
342 map.entry(key.to_string()).or_insert(val);
344 }
345 }
346 self.cache_raw_map = Some(map);
347 }
348
349 fn ensure_lower_decoded(&mut self) {
350 if self.cache_lower_decoded.is_some() {
351 return;
352 }
353
354 self.build_raw_cache();
355 let mut lower_decoded = HashMap::new();
356
357 if let Some(ref raw_map) = self.cache_raw_map {
358 for (key, value) in raw_map.iter() {
359 let decoded_val = html_escape::decode_html_entities(value).to_string();
360 let lower_key = key.to_lowercase();
361 lower_decoded.insert(lower_key, decoded_val);
362 }
363 }
364
365 self.cache_lower_decoded = Some(lower_decoded);
366 }
367}
368
369fn quote_attribute(val: &str) -> String {
370 if val.is_empty() {
371 return val.to_string();
372 }
373 let replaced = val.replace('"', """);
374 let jsoned = serde_json::to_string(&replaced).unwrap_or_else(|_| format!("\"{}\"", replaced));
375 let inner = jsoned.trim_matches('"');
376 let inner = inner
377 .replace("\\t", "\t")
378 .replace("\\n", "\n")
379 .replace("\\r", "\r")
380 .replace('\\', "");
381 format!("\"{}\"", inner)
382}