1use std::collections::HashMap;
13
14#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
16pub struct NodeId(pub u32);
17
18impl NodeId {
19 pub const ROOT: Self = Self(0);
20}
21
22#[derive(Debug, Clone)]
24pub struct Node {
25 pub id: NodeId,
26 pub kind: NodeKind,
27 pub parent: Option<NodeId>,
28 pub children: Vec<NodeId>,
29 pub attributes: HashMap<String, String>,
30 pub style: HashMap<String, String>,
32 pub classes: Vec<String>,
34}
35
/// The kinds of node a DOM tree can contain.
#[derive(Clone, Debug)]
pub enum NodeKind {
    /// The document root.
    Document,
    /// An element such as `<div>`; `tag` is its tag name.
    Element { tag: String },
    /// A run of text.
    Text { content: String },
    /// A comment; `content` is the text between `<!--` and `-->`.
    Comment { content: String },
}
44
45impl Node {
46 pub fn tag(&self) -> &str {
47 match &self.kind {
48 NodeKind::Element { tag } => tag,
49 NodeKind::Document => "#document",
50 NodeKind::Text { .. } => "#text",
51 NodeKind::Comment { .. } => "#comment",
52 }
53 }
54
55 pub fn is_element(&self) -> bool {
56 matches!(self.kind, NodeKind::Element { .. })
57 }
58}
59
60#[derive(Debug, Clone)]
62pub struct Dom {
63 nodes: Vec<Node>,
64 tag_index: HashMap<String, Vec<NodeId>>,
66 id_index: HashMap<String, NodeId>,
68}
69
70impl Default for Dom {
71 fn default() -> Self {
72 Self::empty()
73 }
74}
75
76impl Dom {
77 pub fn empty() -> Self {
79 let root = Node {
80 id: NodeId::ROOT,
81 kind: NodeKind::Document,
82 parent: None,
83 children: Vec::new(),
84 attributes: HashMap::new(),
85 style: HashMap::new(),
86 classes: Vec::new(),
87 };
88 Self {
89 nodes: vec![root],
90 tag_index: HashMap::new(),
91 id_index: HashMap::new(),
92 }
93 }
94
95 pub fn parse(html: &str) -> Self {
99 let mut dom = Self::empty();
100
101 let html_id = dom.create_element("html");
103 dom.append_child(NodeId::ROOT, html_id);
104 let head_id = dom.create_element("head");
105 dom.append_child(html_id, head_id);
106 let body_id = dom.create_element("body");
107 dom.append_child(html_id, body_id);
108
109 let mut stack = vec![body_id];
111 let mut pos = 0;
112 let bytes = html.as_bytes();
113
114 while pos < bytes.len() {
115 if bytes[pos] == b'<' {
116 let _tag_start = pos;
118 pos += 1;
119
120 if pos < bytes.len() && bytes[pos] == b'/' {
121 while pos < bytes.len() && bytes[pos] != b'>' {
123 pos += 1;
124 }
125 if pos < bytes.len() {
126 pos += 1;
127 }
128 if stack.len() > 1 {
129 stack.pop();
130 }
131 } else if pos < bytes.len() && bytes[pos] == b'!' {
132 while pos < bytes.len() && bytes[pos] != b'>' {
134 pos += 1;
135 }
136 if pos < bytes.len() {
137 pos += 1;
138 }
139 } else {
140 let mut tag_end = pos;
142 while tag_end < bytes.len()
143 && bytes[tag_end] != b'>'
144 && bytes[tag_end] != b' '
145 && bytes[tag_end] != b'/'
146 {
147 tag_end += 1;
148 }
149 let tag_name =
150 String::from_utf8_lossy(&bytes[pos..tag_end]).to_ascii_lowercase();
151
152 if tag_name.is_empty() {
153 pos = tag_end;
154 continue;
155 }
156
157 let mut attrs = HashMap::new();
159 let mut attr_pos = tag_end;
160 while attr_pos < bytes.len() && bytes[attr_pos] != b'>' {
161 while attr_pos < bytes.len()
163 && (bytes[attr_pos] == b' ' || bytes[attr_pos] == b'/')
164 {
165 attr_pos += 1;
166 }
167 if attr_pos >= bytes.len() || bytes[attr_pos] == b'>' {
168 break;
169 }
170
171 let name_start = attr_pos;
173 while attr_pos < bytes.len()
174 && bytes[attr_pos] != b'='
175 && bytes[attr_pos] != b' '
176 && bytes[attr_pos] != b'>'
177 && bytes[attr_pos] != b'/'
178 {
179 attr_pos += 1;
180 }
181 let name = String::from_utf8_lossy(&bytes[name_start..attr_pos])
182 .to_ascii_lowercase();
183
184 if attr_pos < bytes.len() && bytes[attr_pos] == b'=' {
185 attr_pos += 1;
186 let value = if attr_pos < bytes.len()
188 && (bytes[attr_pos] == b'"' || bytes[attr_pos] == b'\'')
189 {
190 let quote = bytes[attr_pos];
191 attr_pos += 1;
192 let val_start = attr_pos;
193 while attr_pos < bytes.len() && bytes[attr_pos] != quote {
194 attr_pos += 1;
195 }
196 let v = String::from_utf8_lossy(&bytes[val_start..attr_pos])
197 .into_owned();
198 if attr_pos < bytes.len() {
199 attr_pos += 1;
200 }
201 v
202 } else {
203 let val_start = attr_pos;
204 while attr_pos < bytes.len()
205 && bytes[attr_pos] != b' '
206 && bytes[attr_pos] != b'>'
207 {
208 attr_pos += 1;
209 }
210 String::from_utf8_lossy(&bytes[val_start..attr_pos]).into_owned()
211 };
212 if !name.is_empty() {
213 attrs.insert(name, value);
214 }
215 } else if !name.is_empty() {
216 attrs.insert(name, String::new());
217 }
218 }
219
220 while attr_pos < bytes.len() && bytes[attr_pos] != b'>' {
222 attr_pos += 1;
223 }
224 if attr_pos < bytes.len() {
225 attr_pos += 1;
226 }
227 pos = attr_pos;
228
229 let node_id = dom.create_element(&tag_name);
231 for (k, v) in &attrs {
232 dom.set_attribute(node_id, k, v);
233 }
234
235 let parent = *stack.last().unwrap_or(&body_id);
236 dom.append_child(parent, node_id);
237
238 let self_closing = matches!(
240 tag_name.as_str(),
241 "br" | "hr"
242 | "img"
243 | "input"
244 | "meta"
245 | "link"
246 | "area"
247 | "base"
248 | "col"
249 | "embed"
250 | "source"
251 | "track"
252 | "wbr"
253 );
254 if !self_closing {
255 stack.push(node_id);
256 }
257
258 if tag_name == "script" {
260 if attrs.contains_key("src") {
263 let close_tag = b"</script>";
264 while pos + close_tag.len() <= bytes.len() {
265 if bytes[pos..pos + close_tag.len()].eq_ignore_ascii_case(close_tag)
266 {
267 pos += close_tag.len();
268 while pos < bytes.len() && bytes[pos] != b'>' {
269 pos += 1;
270 }
271 if pos < bytes.len() {
272 pos += 1;
273 }
274 break;
275 }
276 pos += 1;
277 }
278 if stack.last() == Some(&node_id) {
279 stack.pop();
280 }
281 } else {
282 let content_start = pos;
283 let close_tag = b"</script>";
284 while pos + close_tag.len() <= bytes.len() {
285 if bytes[pos..pos + close_tag.len()].eq_ignore_ascii_case(close_tag)
286 {
287 break;
288 }
289 pos += 1;
290 }
291 let script_content =
292 String::from_utf8_lossy(&bytes[content_start..pos]).into_owned();
293 if !script_content.trim().is_empty() {
294 let text_id = dom.create_text(&script_content);
295 dom.append_child(node_id, text_id);
296 }
297 if pos + close_tag.len() <= bytes.len() {
299 pos += close_tag.len();
300 while pos < bytes.len() && bytes[pos] != b'>' {
301 pos += 1;
302 }
303 if pos < bytes.len() {
304 pos += 1;
305 }
306 }
307 if stack.last() == Some(&node_id) {
308 stack.pop();
309 }
310 } } if tag_name == "style" {
315 let content_start = pos;
316 let close_tag = b"</style>";
317 while pos + close_tag.len() <= bytes.len() {
318 if bytes[pos..pos + close_tag.len()].eq_ignore_ascii_case(close_tag) {
319 break;
320 }
321 pos += 1;
322 }
323 let style_content =
324 String::from_utf8_lossy(&bytes[content_start..pos]).into_owned();
325 if !style_content.trim().is_empty() {
326 let text_id = dom.create_text(&style_content);
327 dom.append_child(node_id, text_id);
328 }
329 if pos + close_tag.len() <= bytes.len() {
330 pos += close_tag.len();
331 while pos < bytes.len() && bytes[pos] != b'>' {
332 pos += 1;
333 }
334 if pos < bytes.len() {
335 pos += 1;
336 }
337 }
338 if stack.last() == Some(&node_id) {
339 stack.pop();
340 }
341 }
342 }
343 } else {
344 let text_start = pos;
346 while pos < bytes.len() && bytes[pos] != b'<' {
347 pos += 1;
348 }
349 let text = String::from_utf8_lossy(&bytes[text_start..pos]).into_owned();
350 if !text.trim().is_empty() {
351 let text_id = dom.create_text(&text);
352 let parent = *stack.last().unwrap_or(&body_id);
353 dom.append_child(parent, text_id);
354 }
355 }
356 }
357
358 dom
359 }
360
361 pub fn create_element(&mut self, tag: &str) -> NodeId {
363 let id = NodeId(self.nodes.len() as u32);
364 let tag_lower = tag.to_ascii_lowercase();
365 self.nodes.push(Node {
366 id,
367 kind: NodeKind::Element {
368 tag: tag_lower.clone(),
369 },
370 parent: None,
371 children: Vec::new(),
372 attributes: HashMap::new(),
373 style: HashMap::new(),
374 classes: Vec::new(),
375 });
376 self.tag_index.entry(tag_lower).or_default().push(id);
377 id
378 }
379
380 pub fn create_text(&mut self, content: &str) -> NodeId {
382 let id = NodeId(self.nodes.len() as u32);
383 self.nodes.push(Node {
384 id,
385 kind: NodeKind::Text {
386 content: content.to_string(),
387 },
388 parent: None,
389 children: Vec::new(),
390 attributes: HashMap::new(),
391 style: HashMap::new(),
392 classes: Vec::new(),
393 });
394 id
395 }
396
397 pub fn append_child(&mut self, parent: NodeId, child: NodeId) {
399 if let Some(old_parent) = self.nodes[child.0 as usize].parent {
400 self.nodes[old_parent.0 as usize]
401 .children
402 .retain(|c| *c != child);
403 }
404 self.nodes[child.0 as usize].parent = Some(parent);
405 self.nodes[parent.0 as usize].children.push(child);
406 }
407
408 pub fn set_attribute(&mut self, node: NodeId, name: &str, value: &str) {
410 let name_lower = name.to_ascii_lowercase();
411 self.nodes[node.0 as usize]
412 .attributes
413 .insert(name_lower.clone(), value.to_string());
414
415 if name_lower == "id" {
417 self.id_index.insert(value.to_string(), node);
418 }
419
420 if name_lower == "class" {
422 self.nodes[node.0 as usize].classes =
423 value.split_whitespace().map(|s| s.to_string()).collect();
424 }
425 }
426
427 pub fn get_attribute(&self, node: NodeId, name: &str) -> Option<&str> {
429 self.nodes
430 .get(node.0 as usize)?
431 .attributes
432 .get(&name.to_ascii_lowercase())
433 .map(|s| s.as_str())
434 }
435
436 pub fn remove_attribute(&mut self, node: NodeId, name: &str) {
438 let name_lower = name.to_ascii_lowercase();
439 if let Some(node) = self.nodes.get_mut(node.0 as usize) {
440 if name_lower == "id"
441 && let Some(old_id) = node.attributes.get("id")
442 {
443 self.id_index.remove(old_id);
444 }
445 node.attributes.remove(&name_lower);
446 }
447 }
448
449 pub fn get(&self, id: NodeId) -> Option<&Node> {
451 self.nodes.get(id.0 as usize)
452 }
453
454 pub fn get_mut(&mut self, id: NodeId) -> Option<&mut Node> {
456 self.nodes.get_mut(id.0 as usize)
457 }
458
459 pub fn get_element_by_id(&self, id: &str) -> Option<NodeId> {
461 self.id_index.get(id).copied()
462 }
463
464 pub fn get_elements_by_tag(&self, tag: &str) -> Vec<NodeId> {
466 self.tag_index
467 .get(&tag.to_ascii_lowercase())
468 .cloned()
469 .unwrap_or_default()
470 }
471
472 pub fn query_selector(&self, selector: &str) -> Option<NodeId> {
479 self.query_selector_all(selector).into_iter().next()
480 }
481
482 pub fn query_selector_all(&self, selector: &str) -> Vec<NodeId> {
483 let mut results = Vec::new();
484 for node in &self.nodes {
485 if node.is_element() && self.matches_selector(node, selector) {
486 results.push(node.id);
487 }
488 }
489 results
490 }
491
492 fn matches_selector(&self, node: &Node, selector: &str) -> bool {
493 let selector = selector.trim();
494
495 if let Some(id) = selector.strip_prefix('#') {
496 return node.attributes.get("id").is_some_and(|v| v == id);
497 }
498
499 if let Some(class) = selector.strip_prefix('.') {
500 return node.classes.contains(&class.to_string());
501 }
502
503 if selector.starts_with('[')
504 && let Some(end) = selector.find(']')
505 {
506 let attr_spec = &selector[1..end];
507 if let Some((name, value)) = attr_spec.split_once('=') {
508 let value = value.trim_matches('"').trim_matches('\'');
509 return node
510 .attributes
511 .get(&name.to_ascii_lowercase())
512 .is_some_and(|v| v == value);
513 } else {
514 return node
515 .attributes
516 .contains_key(&attr_spec.to_ascii_lowercase());
517 }
518 }
519
520 if let Some((tag, rest)) = selector.split_once('.') {
522 return node.tag() == tag.to_ascii_lowercase()
523 && node.classes.contains(&rest.to_string());
524 }
525 if let Some((tag, rest)) = selector.split_once('[') {
526 let tag_match = tag.is_empty() || node.tag() == tag.to_ascii_lowercase();
527 if !tag_match {
528 return false;
529 }
530 let attr_selector = format!("[{rest}");
531 return self.matches_selector(node, &attr_selector);
532 }
533
534 node.tag() == selector.to_ascii_lowercase()
535 }
536
537 pub fn extract_inline_scripts(&self) -> Vec<String> {
539 let mut scripts = Vec::new();
540 for script_id in self.get_elements_by_tag("script") {
541 let node = &self.nodes[script_id.0 as usize];
542 if node.attributes.contains_key("src") {
544 continue;
545 }
546 for child_id in &node.children {
548 if let NodeKind::Text { content } = &self.nodes[child_id.0 as usize].kind
549 && !content.trim().is_empty()
550 {
551 scripts.push(content.clone());
552 }
553 }
554 }
555 scripts
556 }
557
558 pub fn extract_script_urls(&self) -> Vec<String> {
560 self.get_elements_by_tag("script")
561 .iter()
562 .filter_map(|id| self.nodes[id.0 as usize].attributes.get("src").cloned())
563 .collect()
564 }
565
566 pub fn extract_form_actions(&self) -> Vec<String> {
568 self.get_elements_by_tag("form")
569 .iter()
570 .filter_map(|id| self.nodes[id.0 as usize].attributes.get("action").cloned())
571 .collect()
572 }
573
574 pub fn extract_password_inputs(&self) -> Vec<NodeId> {
576 self.get_elements_by_tag("input")
577 .into_iter()
578 .filter(|id| {
579 self.nodes[id.0 as usize]
580 .attributes
581 .get("type")
582 .is_some_and(|t| t.eq_ignore_ascii_case("password"))
583 })
584 .collect()
585 }
586
587 pub fn node_count(&self) -> usize {
589 self.nodes.len()
590 }
591
592 pub fn body(&self) -> Option<NodeId> {
594 self.get_elements_by_tag("body").into_iter().next()
595 }
596
597 pub fn head(&self) -> Option<NodeId> {
599 self.get_elements_by_tag("head").into_iter().next()
600 }
601
602 pub fn set_inner_html(&mut self, node: NodeId, html: &str) {
605 let idx = node.0 as usize;
606 if idx >= self.nodes.len() {
607 return; }
609 self.nodes[idx].children.clear();
611
612 let fragment = Dom::parse(html);
614 if let Some(frag_body) = fragment.body()
616 && let Some(frag_node) = fragment.nodes.get(frag_body.0 as usize)
617 {
618 let child_ids: Vec<NodeId> = frag_node.children.clone();
619 for child_id in child_ids {
620 let new_id = self.import_node(&fragment, child_id);
621 self.nodes[idx].children.push(new_id);
622 }
623 }
624 }
625
626 fn import_node(&mut self, source: &Dom, source_id: NodeId) -> NodeId {
628 let src_idx = source_id.0 as usize;
629 let Some(source_node) = source.nodes.get(src_idx) else {
630 let id = NodeId(self.nodes.len() as u32);
632 self.nodes.push(Node {
633 id,
634 kind: NodeKind::Text {
635 content: String::new(),
636 },
637 parent: None,
638 children: Vec::new(),
639 attributes: HashMap::new(),
640 style: HashMap::new(),
641 classes: Vec::new(),
642 });
643 return id;
644 };
645 let new_id = NodeId(self.nodes.len() as u32);
646 let mut new_node = source_node.clone();
647 new_node.id = new_id;
648 new_node.children = Vec::new();
649 self.nodes.push(new_node);
650
651 let child_ids: Vec<NodeId> = source
653 .nodes
654 .get(src_idx)
655 .map(|n| n.children.clone())
656 .unwrap_or_default();
657 for child_id in child_ids {
658 let imported = self.import_node(source, child_id);
659 self.nodes[new_id.0 as usize].children.push(imported);
660 }
661
662 new_id
663 }
664
665 pub fn inner_html(&self, node: NodeId) -> String {
667 let n = &self.nodes[node.0 as usize];
668 let mut html = String::new();
669 for child in &n.children {
670 self.serialize_node(*child, &mut html);
671 }
672 html
673 }
674
675 fn serialize_node(&self, id: NodeId, out: &mut String) {
676 let node = &self.nodes[id.0 as usize];
677 match &node.kind {
678 NodeKind::Text { content } => out.push_str(content),
679 NodeKind::Comment { content } => {
680 out.push_str("<!--");
681 out.push_str(content);
682 out.push_str("-->");
683 }
684 NodeKind::Element { tag } => {
685 out.push('<');
686 out.push_str(tag);
687 for (k, v) in &node.attributes {
688 out.push(' ');
689 out.push_str(k);
690 out.push_str("=\"");
691 out.push_str(v);
692 out.push('"');
693 }
694 out.push('>');
695 for child in &node.children {
696 self.serialize_node(*child, out);
697 }
698 out.push_str("</");
699 out.push_str(tag);
700 out.push('>');
701 }
702 NodeKind::Document => {
703 for child in &node.children {
704 self.serialize_node(*child, out);
705 }
706 }
707 }
708 }
709}
710
#[cfg(test)]
mod tests {
    use super::*;

    /// A document with one `div` is findable both by id and by tag.
    #[test]
    fn parse_simple_html() {
        let markup = "<html><body><div id='test'>hello</div></body></html>";
        let dom = Dom::parse(markup);
        let by_id = dom.get_element_by_id("test");
        let by_tag = dom.get_elements_by_tag("div");
        assert!(by_id.is_some());
        assert_eq!(by_tag.len(), 1);
    }

    /// Form actions and password inputs are extracted from a login form.
    #[test]
    fn parse_form_with_password() {
        let markup = r#"<form action="/login"><input type="text" name="user"><input type="password" name="pass"></form>"#;
        let dom = Dom::parse(markup);
        let actions = dom.extract_form_actions();
        let passwords = dom.extract_password_inputs();
        assert_eq!(actions, vec!["/login"]);
        assert_eq!(passwords.len(), 1);
    }

    /// The body of an inline script is reported as an inline script.
    #[test]
    fn parse_inline_script() {
        let dom = Dom::parse("<script>alert(1)</script>");
        let inline = dom.extract_inline_scripts();
        assert_eq!(inline.len(), 1, "inline: {inline:?}");
        let first = &inline[0];
        assert!(first.contains("alert(1)"));
    }

    /// A script with `src` yields a URL and no inline source.
    #[test]
    fn parse_external_script() {
        let markup = r#"<script src="ext.js"></script>"#;
        let dom = Dom::parse(markup);
        let urls = dom.extract_script_urls();
        assert_eq!(urls, vec!["ext.js"], "urls: {urls:?}");
        let inline_scripts = dom.extract_inline_scripts();
        assert!(inline_scripts.is_empty());
    }

    /// Inline scripts nested in separate containers are all found.
    #[test]
    fn parse_mixed_scripts() {
        let markup =
            r#"<div><script>var x=1;</script></div><div><script>var y=2;</script></div>"#;
        let dom = Dom::parse(markup);
        let inline = dom.extract_inline_scripts();
        assert_eq!(inline.len(), 2, "inline: {inline:?}");
    }

    /// A class selector matches every element carrying that class.
    #[test]
    fn query_selector_by_class() {
        let markup = r#"<div class="a b"><span class="b c">x</span></div>"#;
        let dom = Dom::parse(markup);
        let hits = dom.query_selector_all(".b");
        assert_eq!(hits.len(), 2);
    }

    /// A tag-plus-attribute selector matches only the element with that value.
    #[test]
    fn query_selector_by_attribute() {
        let markup = r#"<input type="password"><input type="text">"#;
        let dom = Dom::parse(markup);
        let hits = dom.query_selector_all("input[type=password]");
        assert_eq!(hits.len(), 1);
    }

    /// Elements built by hand are reachable through the id index.
    #[test]
    fn create_and_append() {
        let mut dom = Dom::empty();

        let body = dom.create_element("body");
        dom.append_child(NodeId::ROOT, body);

        let div = dom.create_element("div");
        dom.set_attribute(div, "id", "new");
        dom.append_child(body, div);

        let found = dom.get_element_by_id("new");
        assert!(found.is_some());
    }

    /// `inner_html` contains the markup and text of the children.
    #[test]
    fn inner_html_serialization() {
        let dom = Dom::parse("<div><span>hello</span></div>");
        let divs = dom.get_elements_by_tag("div");
        assert!(!divs.is_empty());
        let first_div = divs[0];
        let html = dom.inner_html(first_div);
        assert!(html.contains("<span>"));
        assert!(html.contains("hello"));
    }
}