node_html_parser/dom/element/
main.rs1use super::content::parse_fragment;
2use crate::dom::{node::Node, text::TextNode};
3use regex::Regex;
4use std::collections::HashMap;
5use std::fmt;
6
7#[derive(Debug, Clone)]
8pub struct HTMLElement {
9 pub(super) tag_name: Option<String>, pub(crate) raw_attrs: String, pub attrs: Vec<(String, String)>, pub children: Vec<Node>,
13 pub(crate) parent: Option<*mut HTMLElement>,
14 pub(super) is_void: bool,
16 pub(super) void_add_slash: bool,
18 pub(super) cache_raw_map: Option<HashMap<String, String>>, pub(super) cache_lower_decoded: Option<HashMap<String, String>>, pub id: String,
22 pub(super) class_cache: Option<Vec<String>>, pub(super) range: Option<(usize, usize)>, pub(crate) attrs_complete: bool,
26 pub(crate) parse_comment: bool,
27 pub(crate) parse_lowercase: bool,
28}
29
30impl HTMLElement {
31 pub fn new(
32 tag: Option<String>,
33 raw_attrs: String,
34 attrs: Vec<(String, String)>,
35 is_void: bool,
36 void_add_slash: bool,
37 ) -> Self {
38 let mut id_val = String::new();
40 for (k, v) in &attrs {
41 if k.eq_ignore_ascii_case("id") {
42 id_val = v.clone();
43 break;
44 }
45 }
46 Self {
47 tag_name: tag,
48 raw_attrs,
49 attrs,
50 children: Vec::new(),
51 parent: None,
52 is_void,
53 void_add_slash,
54 cache_raw_map: None,
55 cache_lower_decoded: None,
56 id: id_val,
57 class_cache: None,
58 range: None, attrs_complete: false,
60 parse_comment: false,
61 parse_lowercase: false,
62 }
63 }
64 pub fn is_root(&self) -> bool {
66 self.tag_name.is_none()
67 }
68 pub fn name(&self) -> &str {
69 self.tag_name.as_deref().unwrap_or("")
70 }
71 pub fn set_tag_name(&mut self, new_name: &str) {
74 let lowered = new_name.to_lowercase();
75 self.tag_name = Some(lowered);
76 }
77
78 pub fn raw_text(&self) -> String {
80 if !self.is_root() && self.name().eq_ignore_ascii_case("br") {
82 return "\n".to_string();
83 }
84 let mut buf = String::new();
85 for c in &self.children {
86 buf.push_str(&c.raw_text());
87 }
88 buf
89 }
90
91 pub fn class_names(&self) -> String {
92 self.get_attr("class").unwrap_or("").to_string()
93 }
94 pub fn inner_html(&self) -> String {
95 self.children.iter().map(|c| c.to_html()).collect()
96 }
97 pub fn set_inner_html(&mut self, html: &str) {
99 let mut nodes = parse_fragment(html);
100 if nodes.is_empty() {
101 nodes.push(Node::Text(TextNode::new(html.to_string())));
103 }
104 self.children.clear();
105 let self_ptr: *mut HTMLElement = self as *mut HTMLElement;
106 for n in nodes.iter_mut() {
107 if let Node::Element(e) = n {
108 e.parent = Some(self_ptr);
109 }
110 }
111 self.children.extend(nodes);
112 }
113
114 pub fn matches_selector<'a>(&'a self, root: &'a HTMLElement, selector: &str) -> bool {
117 let matches = root.query_selector_all(selector);
119 let self_ptr = self as *const HTMLElement;
120 matches.iter().any(|e| *e as *const HTMLElement == self_ptr)
121 }
122 pub fn matches(&self, selector: &str) -> bool {
124 let root = self.root();
125 self.matches_selector(root, selector)
126 }
127 pub fn root(&self) -> &HTMLElement {
129 let mut cur: &HTMLElement = self;
130 while let Some(p) = cur.parent() {
131 cur = p;
132 }
133 cur
134 }
135 pub fn closest(&self, selector: &str) -> Option<&HTMLElement> {
137 let mut cur: Option<&HTMLElement> = Some(self);
138 while let Some(c) = cur {
139 if c.matches(selector) {
140 return Some(c);
141 }
142 cur = c.parent();
143 }
144 None
145 }
146 pub fn clone(&self) -> HTMLElement {
148 self.clone_node()
149 }
150
151 pub fn iter_elements<'a>(&'a self) -> impl Iterator<Item = &'a HTMLElement> + 'a {
152 self.children.iter().filter_map(|n| n.as_element())
153 }
154 pub fn query_selector_all<'a>(&'a self, selector: &str) -> Vec<&'a HTMLElement> {
155 crate::css_select::select_all(selector, self)
156 }
157 pub fn query_selector<'a>(&'a self, selector: &str) -> Option<&'a HTMLElement> {
158 self.query_selector_all(selector).into_iter().next()
159 }
160
161 pub fn remove_whitespace(&mut self) {
162 self.ensure_all_attrs();
164 let mut out = Vec::with_capacity(self.children.len());
165 for mut child in self.children.drain(..) {
166 match &mut child {
167 Node::Text(t) => {
168 let mut t2 = t.clone();
169 if !t2.is_whitespace() {
170 let new_raw = {
171 let _ = t2.trimmed_raw_text();
172 t2.trimmed_raw_text().to_string()
173 };
174 t2.set_raw(new_raw);
175 out.push(Node::Text(t2));
176 }
177 }
178 Node::Element(e) => {
179 let mut ec = e.clone();
180 ec.remove_whitespace();
181 out.push(Node::Element(ec));
182 }
183 Node::Comment(_) => {}
184 }
185 }
186 self.children = out;
187 self.rebuild_raw_attrs();
188 }
189
190 pub fn trim_right(&mut self, pattern: &Regex) {
192 let mut i = 0usize;
193 while i < self.children.len() {
194 match &mut self.children[i] {
195 Node::Element(e) => {
196 let mut ec = e.clone();
197 ec.trim_right(pattern);
198 self.children[i] = Node::Element(ec);
199 }
200 Node::Text(t) => {
201 if let Some(mat) = pattern.find(&t.raw) {
202 let new_raw = t.raw[..mat.start()].to_string();
203 let mut nt = t.clone();
204 nt.set_raw(new_raw);
205 self.children[i] = Node::Text(nt);
206 self.children.truncate(i + 1); return;
208 }
209 }
210 Node::Comment(_) => {}
211 }
212 i += 1;
213 }
214 }
215
216 pub fn structure(&self) -> String {
218 let mut res = Vec::new();
219 fn dfs(cur: &HTMLElement, indent: usize, out: &mut Vec<String>) {
220 if cur.is_root() {
221 for child in &cur.children {
222 if let Node::Element(e) = child {
223 dfs(e, 0, out);
224 }
225 }
226 return;
227 }
228 let mut line = String::new();
229 line.push_str(&" ".repeat(indent));
230 line.push_str(cur.name());
231 if !cur.id.is_empty() {
232 line.push('#');
233 line.push_str(&cur.id);
234 }
235 if let Some(cls) = cur.get_attr("class") {
236 if !cls.is_empty() {
237 let mut seen = std::collections::HashSet::new();
239 for c in cls.split_whitespace() {
240 if seen.insert(c) {
241 line.push('.');
242 line.push_str(c);
243 }
244 }
245 }
246 }
247 out.push(line);
248 for child in &cur.children {
249 match child {
250 Node::Element(e) => dfs(e, indent + 1, out),
251 Node::Text(t) => {
252 if !t.is_whitespace() {
253 out.push(format!("{}#text", " ".repeat(indent + 1)));
254 }
255 }
256 Node::Comment(_) => {}
257 }
258 }
259 }
260 dfs(self, 0, &mut res);
261 res.join("\n")
262 }
263 pub fn get_elements_by_tag_name<'a>(&'a self, tag: &str) -> Vec<&'a HTMLElement> {
264 let tgt = tag.to_lowercase();
265 let mut acc = Vec::new();
266 fn walk<'b>(cur: &'b HTMLElement, tgt: &str, acc: &mut Vec<&'b HTMLElement>) {
267 for c in &cur.children {
268 if let Node::Element(e) = c {
269 let inner = &**e;
270 if tgt == "*" || inner.name().eq_ignore_ascii_case(tgt) {
271 acc.push(inner);
272 }
273 walk(inner, tgt, acc);
274 }
275 }
276 }
277 walk(self, &tgt, &mut acc);
278 acc
279 }
280 pub fn get_element_by_id<'a>(&'a self, id: &str) -> Option<&'a HTMLElement> {
281 fn walk<'b>(cur: &'b HTMLElement, id: &str) -> Option<&'b HTMLElement> {
282 for c in &cur.children {
283 if let Node::Element(e) = c {
284 let inner = &**e;
285 if inner.get_attr("id") == Some(id) {
286 return Some(inner);
287 }
288 if let Some(f) = walk(inner, id) {
289 return Some(f);
290 }
291 }
292 }
293 None
294 }
295 walk(self, id)
296 }
297 pub fn get_element_by_id_mut<'a>(&'a mut self, id: &str) -> Option<&'a mut HTMLElement> {
298 fn walk<'b>(cur: &'b mut HTMLElement, id: &str) -> Option<&'b mut HTMLElement> {
299 for c in cur.children.iter_mut() {
300 if let Node::Element(e) = c {
301 if e.id == id || e.get_attr("id") == Some(id) {
303 return Some(e);
304 }
305 if let Some(found) = walk(e, id) {
306 return Some(found);
307 }
308 }
309 }
310 None
311 }
312 walk(self, id)
313 }
314 pub fn clone_node(&self) -> HTMLElement {
315 fn clone_rec(el: &HTMLElement) -> Box<HTMLElement> {
316 let mut new = Box::new(HTMLElement {
317 tag_name: el.tag_name.clone(),
318 raw_attrs: el.raw_attrs.clone(),
319 attrs: el.attrs.clone(),
320 children: Vec::new(),
321 parent: None,
322 is_void: el.is_void,
323 void_add_slash: el.void_add_slash,
324 cache_raw_map: None,
325 cache_lower_decoded: None,
326 id: el.id.clone(),
327 class_cache: el.class_cache.clone(),
328 range: None,
329 attrs_complete: el.attrs_complete,
330 parse_comment: el.parse_comment,
331 parse_lowercase: el.parse_lowercase,
332 });
333 for c in &el.children {
334 match c {
335 Node::Element(e) => new.children.push(Node::Element(clone_rec(e))),
336 Node::Text(t) => new.children.push(Node::Text(t.clone())),
337 Node::Comment(cm) => new.children.push(Node::Comment(cm.clone())),
338 };
339 }
340 new
341 }
342 *clone_rec(self)
343 }
344 pub fn clone_shallow(&self) -> HTMLElement {
346 HTMLElement {
347 tag_name: self.tag_name.clone(),
348 raw_attrs: self.raw_attrs.clone(),
349 attrs: self.attrs.clone(),
350 children: Vec::new(),
351 parent: None,
352 is_void: self.is_void,
353 void_add_slash: self.void_add_slash,
354 cache_raw_map: None,
355 cache_lower_decoded: None,
356 id: self.id.clone(),
357 class_cache: self.class_cache.clone(),
358 range: None,
359 attrs_complete: self.attrs_complete,
360 parse_comment: self.parse_comment,
361 parse_lowercase: self.parse_lowercase,
362 }
363 }
364 pub fn set_range_start(&mut self, start: usize) {
365 match self.range {
366 Some((_, e)) => self.range = Some((start, e)),
367 None => self.range = Some((start, start)),
368 }
369 }
370 pub fn set_range_end(&mut self, end: usize) {
371 match self.range {
372 Some((s, _)) => self.range = Some((s, end)),
373 None => self.range = Some((end, end)),
374 }
375 }
376 pub fn range(&self) -> Option<(usize, usize)> {
377 self.range
378 }
379}
380
381impl fmt::Display for HTMLElement {
382 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
383 write!(f, "{}", self.outer_html())
384 }
385}