node_html_parser/dom/element/
main.rs1use super::content::parse_fragment;
2use crate::dom::{node::Node, text::TextNode};
3use regex::Regex;
4use std::collections::HashMap;
5use std::fmt;
6
7#[cfg(feature = "parallel")]
8use rayon::prelude::*;
9
/// An element node of the HTML tree; a value whose `tag_name` is `None` acts as the
/// tag-less root container (see `is_root`).
///
/// `Clone` is derived, so a derived clone copies the raw `parent` pointer verbatim.
/// `clone_node` / `clone_shallow` build copies with `parent` reset to `None` instead.
#[derive(Debug, Clone)]
pub struct HTMLElement {
    /// Tag name, or `None` for the root container. `set_tag_name` stores it lowercased.
    pub(super) tag_name: Option<String>,
    /// Attribute text as handed to `new`.
    /// NOTE(review): presumably the unparsed attribute portion of the start tag — confirm.
    pub(crate) raw_attrs: String,
    /// Attribute `(key, value)` pairs; `new` scans these for the `id` attribute.
    pub attrs: Vec<(String, String)>,
    /// Child nodes in document order.
    pub children: Vec<Node>,
    /// Raw pointer to the owning element, `None` while detached.
    /// Nothing in this type updates it automatically when the pointee moves or is dropped.
    pub(crate) parent: Option<*mut HTMLElement>,
    /// NOTE(review): presumably marks a void (content-less) tag — confirm against the parser.
    pub(super) is_void: bool,
    /// NOTE(review): presumably controls whether a void tag serializes with a trailing slash — confirm.
    pub(super) void_add_slash: bool,
    /// Attribute lookup cache; starts as `None` and is never copied by `clone_node` / `clone_shallow`.
    /// NOTE(review): exact key/value form (raw vs. decoded) is defined elsewhere — confirm.
    pub(super) cache_raw_map: Option<HashMap<String, String>>,
    /// Second attribute lookup cache; also starts as `None` and is dropped on clone.
    /// NOTE(review): name suggests lowercased keys with decoded values — confirm.
    pub(super) cache_lower_decoded: Option<HashMap<String, String>>,
    /// Value of the first attribute named `id` (ASCII case-insensitive) at construction,
    /// or the empty string when there is none.
    pub id: String,
    /// Cached class list; `None` until populated by code outside this block.
    pub(super) class_cache: Option<Vec<String>>,
    /// `(start, end)` position pair maintained by `set_range_start` / `set_range_end`.
    /// NOTE(review): presumably offsets into the parsed source — confirm units.
    pub(super) range: Option<(usize, usize)>,
    /// NOTE(review): presumably set once `attrs` holds every attribute (see `ensure_all_attrs`) — confirm.
    pub(crate) attrs_complete: bool,
    /// NOTE(review): presumably set when attributes were changed after parsing — confirm.
    pub(crate) attrs_modified: bool,
    /// Parse option carried on the element; `false` in `new`.
    /// NOTE(review): presumably "keep comment nodes" — confirm.
    pub(crate) parse_comment: bool,
    /// Parse option carried on the element; `false` in `new`.
    /// NOTE(review): presumably "lowercase tag names while parsing" — confirm.
    pub(crate) parse_lowercase: bool,
}
34
35impl HTMLElement {
36 pub fn new(
37 tag: Option<String>,
38 raw_attrs: String,
39 attrs: Vec<(String, String)>,
40 is_void: bool,
41 void_add_slash: bool,
42 ) -> Self {
43 let mut id_val = String::new();
45 for (k, v) in &attrs {
46 if k.eq_ignore_ascii_case("id") {
47 id_val = v.clone();
48 break;
49 }
50 }
51 Self {
52 tag_name: tag,
53 raw_attrs,
54 attrs,
55 children: Vec::with_capacity(2),
57 parent: None,
58 is_void,
59 void_add_slash,
60 cache_raw_map: None,
61 cache_lower_decoded: None,
62
63 id: id_val,
64 class_cache: None,
65 range: None, attrs_complete: false,
67 attrs_modified: false,
68 parse_comment: false,
69 parse_lowercase: false,
70 }
71 }
72 pub fn is_root(&self) -> bool {
74 self.tag_name.is_none()
75 }
76 pub fn name(&self) -> &str {
77 self.tag_name.as_deref().unwrap_or("")
78 }
79 pub fn set_tag_name(&mut self, new_name: &str) {
82 let lowered = new_name.to_lowercase();
83 self.tag_name = Some(lowered);
84 }
85
86 pub fn raw_text(&self) -> String {
88 if !self.is_root() && self.name().eq_ignore_ascii_case("br") {
90 return "\n".to_string();
91 }
92 let mut buf = String::new();
93 for c in &self.children {
94 buf.push_str(&c.raw_text());
95 }
96 buf
97 }
98
99 pub fn class_names(&self) -> String {
100 self.get_attr("class").unwrap_or("").to_string()
101 }
102 pub fn inner_html(&self) -> String {
103 self.children.iter().map(|c| c.to_html()).collect()
104 }
105 pub fn set_inner_html(&mut self, html: &str) {
107 let mut nodes = parse_fragment(html);
108 if nodes.is_empty() {
109 nodes.push(Node::Text(TextNode::new(html.to_string())));
111 }
112 self.children.clear();
113 let self_ptr: *mut HTMLElement = self as *mut HTMLElement;
114 for n in nodes.iter_mut() {
115 if let Node::Element(e) = n {
116 e.parent = Some(self_ptr);
117 }
118 }
119 self.children.extend(nodes);
120 }
121
122 pub fn matches_selector<'a>(&'a self, root: &'a HTMLElement, selector: &str) -> bool {
125 let matches = root.query_selector_all(selector);
127 let self_ptr = self as *const HTMLElement;
128 matches.iter().any(|e| *e as *const HTMLElement == self_ptr)
129 }
130 pub fn matches(&self, selector: &str) -> bool {
132 let root = self.root();
133 self.matches_selector(root, selector)
134 }
135 pub fn root(&self) -> &HTMLElement {
137 let mut cur: &HTMLElement = self;
138 while let Some(p) = cur.parent() {
139 cur = p;
140 }
141 cur
142 }
143 pub fn closest(&self, selector: &str) -> Option<&HTMLElement> {
145 let mut cur: Option<&HTMLElement> = Some(self);
146 while let Some(c) = cur {
147 if c.matches(selector) {
148 return Some(c);
149 }
150 cur = c.parent();
151 }
152 None
153 }
154 pub fn clone(&self) -> HTMLElement {
156 self.clone_node()
157 }
158
159 pub fn iter_elements<'a>(&'a self) -> impl Iterator<Item = &'a HTMLElement> + 'a {
160 self.children.iter().filter_map(|n| n.as_element())
161 }
162 pub fn query_selector_all<'a>(&'a self, selector: &str) -> Vec<&'a HTMLElement> {
163 crate::css_select::select_all(selector, self)
164 }
165 pub fn query_selector<'a>(&'a self, selector: &str) -> Option<&'a HTMLElement> {
166 self.query_selector_all(selector).into_iter().next()
167 }
168
169 pub fn remove_whitespace(&mut self) {
170 self.ensure_all_attrs();
172 let mut out = Vec::with_capacity(self.children.len());
173 for mut child in self.children.drain(..) {
174 match &mut child {
175 Node::Text(t) => {
176 let mut t2 = t.clone();
177 if !t2.is_whitespace() {
178 let new_raw = {
179 let _ = t2.trimmed_raw_text();
180 t2.trimmed_raw_text().to_string()
181 };
182 t2.set_raw(new_raw);
183 out.push(Node::Text(t2));
184 }
185 }
186 Node::Element(e) => {
187 let mut ec = e.clone();
188 ec.remove_whitespace();
189 out.push(Node::Element(ec));
190 }
191 Node::Comment(_) => {}
192 }
193 }
194 self.children = out;
195 self.rebuild_raw_attrs();
196 }
197
198 pub fn trim_right(&mut self, pattern: &Regex) {
200 let mut i = 0usize;
201 while i < self.children.len() {
202 match &mut self.children[i] {
203 Node::Element(e) => {
204 let mut ec = e.clone();
205 ec.trim_right(pattern);
206 self.children[i] = Node::Element(ec);
207 }
208 Node::Text(t) => {
209 if let Some(mat) = pattern.find(&t.raw) {
210 let new_raw = t.raw[..mat.start()].to_string();
211 let mut nt = t.clone();
212 nt.set_raw(new_raw);
213 self.children[i] = Node::Text(nt);
214 self.children.truncate(i + 1); return;
216 }
217 }
218 Node::Comment(_) => {}
219 }
220 i += 1;
221 }
222 }
223
224 pub fn structure(&self) -> String {
226 let mut res = Vec::new();
227 fn dfs(cur: &HTMLElement, indent: usize, out: &mut Vec<String>) {
228 if cur.is_root() {
229 for child in &cur.children {
230 if let Node::Element(e) = child {
231 dfs(e, 0, out);
232 }
233 }
234 return;
235 }
236 let mut line = String::new();
237 line.push_str(&" ".repeat(indent));
238 line.push_str(cur.name());
239 if !cur.id.is_empty() {
240 line.push('#');
241 line.push_str(&cur.id);
242 }
243 if let Some(cls) = cur.get_attr("class") {
244 if !cls.is_empty() {
245 let mut seen = std::collections::HashSet::new();
247 for c in cls.split_whitespace() {
248 if seen.insert(c) {
249 line.push('.');
250 line.push_str(c);
251 }
252 }
253 }
254 }
255 out.push(line);
256 for child in &cur.children {
257 match child {
258 Node::Element(e) => dfs(e, indent + 1, out),
259 Node::Text(t) => {
260 if !t.is_whitespace() {
261 out.push(format!("{}#text", " ".repeat(indent + 1)));
262 }
263 }
264 Node::Comment(_) => {}
265 }
266 }
267 }
268 dfs(self, 0, &mut res);
269 res.join("\n")
270 }
271 pub fn get_elements_by_tag_name<'a>(&'a self, tag: &str) -> Vec<&'a HTMLElement> {
272 let tgt = tag.to_lowercase();
273 let mut acc = Vec::new();
274 fn walk<'b>(cur: &'b HTMLElement, tgt: &str, acc: &mut Vec<&'b HTMLElement>) {
275 for c in &cur.children {
276 if let Node::Element(e) = c {
277 let inner = &**e;
278 if tgt == "*" || inner.name().eq_ignore_ascii_case(tgt) {
279 acc.push(inner);
280 }
281 walk(inner, tgt, acc);
282 }
283 }
284 }
285 walk(self, &tgt, &mut acc);
286 acc
287 }
288 pub fn get_element_by_id<'a>(&'a self, id: &str) -> Option<&'a HTMLElement> {
289 fn walk<'b>(cur: &'b HTMLElement, id: &str) -> Option<&'b HTMLElement> {
290 for c in &cur.children {
291 if let Node::Element(e) = c {
292 let inner = &**e;
293 if inner.get_attr("id") == Some(id) {
294 return Some(inner);
295 }
296 if let Some(f) = walk(inner, id) {
297 return Some(f);
298 }
299 }
300 }
301 None
302 }
303 walk(self, id)
304 }
/// Mutable lookup of a descendant element by id.
///
/// Searches depth-first in document order (`self` itself is not tested) and returns
/// the first element whose `id` field equals `id` or whose `get_attr("id")` value
/// equals `id`. Returns `None` when no descendant qualifies.
pub fn get_element_by_id_mut<'a>(&'a mut self, id: &str) -> Option<&'a mut HTMLElement> {
    // Recursive helper: checks each element child, then that child's subtree, before
    // moving on to the next sibling.
    fn walk<'b>(cur: &'b mut HTMLElement, id: &str) -> Option<&'b mut HTMLElement> {
        for c in cur.children.iter_mut() {
            if let Node::Element(e) = c {
                // Accept a hit on either the `id` field or the attribute lookup.
                if e.id == id || e.get_attr("id") == Some(id) {
                    return Some(e);
                }
                if let Some(found) = walk(e, id) {
                    return Some(found);
                }
            }
        }
        None
    }
    walk(self, id)
}
322 pub fn clone_node(&self) -> HTMLElement {
323 fn clone_rec(el: &HTMLElement) -> Box<HTMLElement> {
324 let mut new = Box::new(HTMLElement {
325 tag_name: el.tag_name.clone(),
326 raw_attrs: el.raw_attrs.clone(),
327 attrs: el.attrs.clone(),
328 children: Vec::new(),
329 parent: None,
330 is_void: el.is_void,
331 void_add_slash: el.void_add_slash,
332 cache_raw_map: None,
333 cache_lower_decoded: None,
334
335 id: el.id.clone(),
336 class_cache: el.class_cache.clone(),
337 range: None,
338 attrs_complete: el.attrs_complete,
339 attrs_modified: el.attrs_modified,
340 parse_comment: el.parse_comment,
341 parse_lowercase: el.parse_lowercase,
342 });
343 for c in &el.children {
344 match c {
345 Node::Element(e) => new.children.push(Node::Element(clone_rec(e))),
346 Node::Text(t) => new.children.push(Node::Text(t.clone())),
347 Node::Comment(cm) => new.children.push(Node::Comment(cm.clone())),
348 };
349 }
350 new
351 }
352 *clone_rec(self)
353 }
354 pub fn clone_shallow(&self) -> HTMLElement {
356 HTMLElement {
357 tag_name: self.tag_name.clone(),
358 raw_attrs: self.raw_attrs.clone(),
359 attrs: self.attrs.clone(),
360 children: Vec::new(),
361 parent: None,
362 is_void: self.is_void,
363 void_add_slash: self.void_add_slash,
364 cache_raw_map: None,
365 cache_lower_decoded: None,
366
367 id: self.id.clone(),
368 class_cache: self.class_cache.clone(),
369 range: None,
370 attrs_complete: self.attrs_complete,
371 attrs_modified: self.attrs_modified,
372 parse_comment: self.parse_comment,
373 parse_lowercase: self.parse_lowercase,
374 }
375 }
376 pub fn set_range_start(&mut self, start: usize) {
377 match self.range {
378 Some((_, e)) => self.range = Some((start, e)),
379 None => self.range = Some((start, start)),
380 }
381 }
382 pub fn set_range_end(&mut self, end: usize) {
383 match self.range {
384 Some((s, _)) => self.range = Some((s, end)),
385 None => self.range = Some((end, end)),
386 }
387 }
388 pub fn range(&self) -> Option<(usize, usize)> {
389 self.range
390 }
391
/// Calls `ensure_all_attrs` on every element of the slice, one after another in
/// slice order (no parallel iterator is used here).
#[cfg(feature = "parallel")]
pub fn batch_ensure_attributes_safe(elements: &mut [HTMLElement]) {
    elements.iter_mut().for_each(|element| {
        element.ensure_all_attrs();
    });
}
401
/// Calls `is_whitespace` and `trimmed_raw_text` on every text node using a parallel
/// iterator, discarding the results.
///
/// Slices shorter than 20 nodes are left untouched.
/// NOTE(review): presumably this exists to pre-populate per-node cached values —
/// confirm against `TextNode`.
#[cfg(feature = "parallel")]
pub fn process_text_nodes_parallel(text_nodes: &mut [crate::dom::text::TextNode]) {
    const PARALLEL_THRESHOLD: usize = 20;

    if text_nodes.len() < PARALLEL_THRESHOLD {
        return;
    }
    text_nodes.par_iter_mut().for_each(|text| {
        let _ = text.is_whitespace();
        let _ = text.trimmed_raw_text();
    });
}
415}
416
417impl fmt::Display for HTMLElement {
418 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
419 write!(f, "{}", self.outer_html())
420 }
421}