1use crate::types::DOMNodeInfo;
2
3#[derive(Debug, Clone)]
4pub struct DomNode {
5 pub tag: String,
6 pub id: Option<String>,
7 pub classes: Vec<String>,
8 pub start: usize,
9 pub end: usize,
10 pub parent: Option<usize>,
11 pub children: Vec<usize>,
12}
13
14#[derive(Debug, Clone)]
15pub struct DomTree {
16 nodes: Vec<DomNode>,
17 roots: Vec<usize>,
18}
19
20#[derive(Debug, Clone)]
21pub struct StyleBlock {
22 pub content: String,
23 pub content_start: usize,
24}
25
26#[derive(Debug, Clone)]
27pub struct InlineStyle {
28 pub value: String,
29 pub value_start: usize,
30 pub attribute_start: usize,
31}
32
33#[derive(Debug, Clone)]
34pub struct HtmlParseResult {
35 pub dom_tree: DomTree,
36 pub style_blocks: Vec<StyleBlock>,
37 pub inline_styles: Vec<InlineStyle>,
38}
39
40impl DomTree {
41 pub fn parse(html: &str) -> HtmlParseResult {
42 let bytes = html.as_bytes();
43 let len = bytes.len();
44 let mut i = 0;
45 let mut comment_depth = 0usize;
46 let mut nodes: Vec<DomNode> = Vec::new();
47 let mut roots: Vec<usize> = Vec::new();
48 let mut stack: Vec<usize> = Vec::new();
49 let mut style_blocks = Vec::new();
50 let mut inline_styles = Vec::new();
51
52 while i < len {
53 if comment_depth > 0 {
54 if starts_with(bytes, i, b"<!--") {
55 comment_depth += 1;
56 i += 4;
57 continue;
58 }
59 if starts_with(bytes, i, b"-->") {
60 comment_depth -= 1;
61 i += 3;
62 continue;
63 }
64 i += 1;
65 continue;
66 }
67
68 if starts_with(bytes, i, b"<!--") {
69 comment_depth = 1;
70 i += 4;
71 continue;
72 }
73
74 if bytes[i] != b'<' {
75 i += 1;
76 continue;
77 }
78
79 if i + 1 >= len {
80 break;
81 }
82
83 if bytes[i + 1] == b'/' {
84 if let Some((tag_name, end_pos)) = parse_end_tag(html, i) {
86 let mut match_index = None;
87 for (pos, node_idx) in stack.iter().enumerate().rev() {
88 if nodes[*node_idx].tag == tag_name {
89 match_index = Some(pos);
90 break;
91 }
92 }
93 if let Some(pos) = match_index {
94 let node_idx = stack[pos];
95 nodes[node_idx].end = end_pos;
96 stack.truncate(pos);
97 }
98 i = end_pos;
99 continue;
100 }
101 }
102
103 if bytes[i + 1] == b'!' {
104 if let Some(end_pos) = find_char(bytes, i + 2, b'>') {
106 i = end_pos + 1;
107 continue;
108 }
109 }
110
111 let tag_start = i;
112 i += 1;
113 while i < len && bytes[i].is_ascii_whitespace() {
114 i += 1;
115 }
116 let name_start = i;
117 while i < len && is_tag_name_char(bytes[i]) {
118 i += 1;
119 }
120 if name_start == i {
121 i += 1;
122 continue;
123 }
124 let tag_name = html[name_start..i].to_lowercase();
125
126 let mut id: Option<String> = None;
127 let mut classes: Vec<String> = Vec::new();
128 let mut self_closing = false;
129
130 while i < len {
131 while i < len && bytes[i].is_ascii_whitespace() {
132 i += 1;
133 }
134 if i >= len {
135 break;
136 }
137 if bytes[i] == b'/' && i + 1 < len && bytes[i + 1] == b'>' {
138 self_closing = true;
139 i += 2;
140 break;
141 }
142 if bytes[i] == b'>' {
143 i += 1;
144 break;
145 }
146
147 let attr_name_start = i;
148 while i < len && is_attr_name_char(bytes[i]) {
149 i += 1;
150 }
151 if attr_name_start == i {
152 i += 1;
153 continue;
154 }
155 let attr_name = html[attr_name_start..i].to_lowercase();
156 while i < len && bytes[i].is_ascii_whitespace() {
157 i += 1;
158 }
159
160 let mut value: Option<String> = None;
161 let mut value_start = None;
162 if i < len && bytes[i] == b'=' {
163 i += 1;
164 while i < len && bytes[i].is_ascii_whitespace() {
165 i += 1;
166 }
167 if i < len && (bytes[i] == b'"' || bytes[i] == b'\'') {
168 let quote = bytes[i];
169 i += 1;
170 let start = i;
171 while i < len && bytes[i] != quote {
172 i += 1;
173 }
174 let end = i.min(len);
175 value = Some(html[start..end].to_string());
176 value_start = Some(start);
177 if i < len {
178 i += 1;
179 }
180 } else {
181 let start = i;
182 while i < len && !bytes[i].is_ascii_whitespace() && bytes[i] != b'>' {
183 i += 1;
184 }
185 let end = i;
186 value = Some(html[start..end].to_string());
187 value_start = Some(start);
188 }
189 }
190
191 match attr_name.as_str() {
192 "id" => {
193 if let Some(v) = &value {
194 if !v.is_empty() {
195 id = Some(v.to_string());
196 }
197 }
198 }
199 "class" | "classname" => {
200 if let Some(v) = &value {
201 classes.extend(v.split_whitespace().map(|c| c.to_string()));
202 }
203 }
204 "style" => {
205 if let (Some(v), Some(v_start)) = (value.clone(), value_start) {
206 inline_styles.push(InlineStyle {
207 value: v,
208 value_start: v_start,
209 attribute_start: attr_name_start,
210 });
211 }
212 }
213 _ => {}
214 }
215 }
216
217 let tag_end = i;
218 let node_idx = nodes.len();
219 let parent = stack.last().copied();
220 nodes.push(DomNode {
221 tag: tag_name.clone(),
222 id,
223 classes,
224 start: tag_start,
225 end: tag_end,
226 parent,
227 children: Vec::new(),
228 });
229
230 if let Some(parent_idx) = parent {
231 nodes[parent_idx].children.push(node_idx);
232 } else {
233 roots.push(node_idx);
234 }
235
236 if tag_name == "style" {
237 if let Some((content_start, content_end, close_end)) =
238 find_block_content(html, tag_end, "style")
239 {
240 let content = html[content_start..content_end].to_string();
241 style_blocks.push(StyleBlock {
242 content,
243 content_start,
244 });
245 nodes[node_idx].end = close_end;
246 i = close_end;
247 continue;
248 }
249 }
250
251 if tag_name == "script" {
252 if let Some((_, _, close_end)) = find_block_content(html, tag_end, "script") {
253 nodes[node_idx].end = close_end;
254 i = close_end;
255 continue;
256 }
257 }
258
259 if self_closing || is_void_tag(&tag_name) {
260 nodes[node_idx].end = tag_end;
261 } else {
262 stack.push(node_idx);
263 }
264 }
265
266 let final_end = html.len();
267 for idx in stack {
268 if nodes[idx].end < final_end {
269 nodes[idx].end = final_end;
270 }
271 }
272
273 HtmlParseResult {
274 dom_tree: DomTree { nodes, roots },
275 style_blocks,
276 inline_styles,
277 }
278 }
279
280 pub fn find_node_at_position(&self, position: usize) -> Option<DOMNodeInfo> {
281 for &root_idx in &self.roots {
282 if let Some(info) = self.find_node_recursive(root_idx, position) {
283 return Some(info);
284 }
285 }
286 None
287 }
288
289 fn find_node_recursive(&self, idx: usize, position: usize) -> Option<DOMNodeInfo> {
290 let node = &self.nodes[idx];
291 if position < node.start || position > node.end {
292 return None;
293 }
294 for &child in &node.children {
295 if let Some(found) = self.find_node_recursive(child, position) {
296 return Some(found);
297 }
298 }
299 Some(self.to_info(idx))
300 }
301
302 pub fn matches_selector(&self, node_index: usize, selector: &str) -> bool {
303 let selector = selector.trim();
304 if selector.is_empty() {
305 return false;
306 }
307 if selector == ":root" {
308 return true;
309 }
310 let selectors: Vec<&str> = selector.split(',').map(|s| s.trim()).collect();
311 for sel in selectors {
312 if sel.is_empty() {
313 continue;
314 }
315 let parts = parse_selector_parts(sel);
316 if matches_selector_parts(self, node_index, &parts) {
317 return true;
318 }
319 }
320 false
321 }
322
323 fn to_info(&self, idx: usize) -> DOMNodeInfo {
324 let node = &self.nodes[idx];
325 DOMNodeInfo {
326 tag: node.tag.clone(),
327 id: node.id.clone(),
328 classes: node.classes.clone(),
329 position: node.start,
330 node_index: Some(idx),
331 }
332 }
333}
334
335#[derive(Debug, Clone, Copy)]
336enum Combinator {
337 Descendant,
338 Child,
339}
340
341#[derive(Debug, Clone)]
342struct SimpleSelector {
343 tag: Option<String>,
344 id: Option<String>,
345 classes: Vec<String>,
346}
347
348#[derive(Debug, Clone)]
349struct SelectorPart {
350 combinator: Combinator,
351 selector: SimpleSelector,
352}
353
354fn parse_selector_parts(selector: &str) -> Vec<SelectorPart> {
355 let mut tokens: Vec<String> = Vec::new();
356 let mut current = String::new();
357 let mut in_attr = 0usize;
358 let mut in_paren = 0usize;
359 let mut last_was_space = false;
360
361 for ch in selector.chars() {
362 match ch {
363 '[' => {
364 in_attr += 1;
365 current.push(ch);
366 last_was_space = false;
367 }
368 ']' => {
369 in_attr = in_attr.saturating_sub(1);
370 current.push(ch);
371 last_was_space = false;
372 }
373 '(' => {
374 in_paren += 1;
375 current.push(ch);
376 last_was_space = false;
377 }
378 ')' => {
379 in_paren = in_paren.saturating_sub(1);
380 current.push(ch);
381 last_was_space = false;
382 }
383 '>' if in_attr == 0 && in_paren == 0 => {
384 if !current.trim().is_empty() {
385 tokens.push(current.trim().to_string());
386 }
387 tokens.push(">".to_string());
388 current.clear();
389 last_was_space = false;
390 }
391 ch if ch.is_whitespace() && in_attr == 0 && in_paren == 0 => {
392 if !current.trim().is_empty() {
393 tokens.push(current.trim().to_string());
394 current.clear();
395 }
396 if !last_was_space {
397 tokens.push(" ".to_string());
398 last_was_space = true;
399 }
400 }
401 _ => {
402 current.push(ch);
403 last_was_space = false;
404 }
405 }
406 }
407
408 if !current.trim().is_empty() {
409 tokens.push(current.trim().to_string());
410 }
411
412 let mut parts = Vec::new();
413 let mut next_combinator = Combinator::Descendant;
414
415 for token in tokens {
416 if token == ">" {
417 next_combinator = Combinator::Child;
418 continue;
419 }
420 if token == " " {
421 next_combinator = Combinator::Descendant;
422 continue;
423 }
424 let selector = parse_simple_selector(&token);
425 parts.push(SelectorPart {
426 combinator: next_combinator,
427 selector,
428 });
429 next_combinator = Combinator::Descendant;
430 }
431
432 parts
433}
434
435fn parse_simple_selector(token: &str) -> SimpleSelector {
436 let mut tag: Option<String> = None;
437 let mut id: Option<String> = None;
438 let mut classes: Vec<String> = Vec::new();
439
440 let mut slice = token;
441 if let Some(idx) = slice.find([':', '[']) {
442 slice = &slice[..idx];
443 }
444
445 let mut current = String::new();
446 let mut mode = 't'; for ch in slice.chars() {
449 match ch {
450 '#' => {
451 if mode == 't' && !current.is_empty() && tag.is_none() {
452 tag = Some(current.clone());
453 } else if mode == 'c' && !current.is_empty() {
454 classes.push(current.clone());
455 }
456 current.clear();
457 mode = 'i';
458 }
459 '.' => {
460 if mode == 't' && !current.is_empty() && tag.is_none() {
461 tag = Some(current.clone());
462 } else if mode == 'i' && !current.is_empty() {
463 id = Some(current.clone());
464 } else if mode == 'c' && !current.is_empty() {
465 classes.push(current.clone());
466 }
467 current.clear();
468 mode = 'c';
469 }
470 _ => current.push(ch),
471 }
472 }
473
474 if !current.is_empty() {
475 match mode {
476 't' => {
477 if current != "*" {
478 tag = Some(current);
479 }
480 }
481 'i' => id = Some(current),
482 'c' => classes.push(current),
483 _ => {}
484 }
485 }
486
487 SimpleSelector { tag, id, classes }
488}
489
490fn matches_selector_parts(tree: &DomTree, node_index: usize, parts: &[SelectorPart]) -> bool {
491 if parts.is_empty() {
492 return false;
493 }
494
495 let mut current_index = Some(node_index);
496 for (idx, part) in parts.iter().enumerate().rev() {
497 let node_idx = match current_index {
498 Some(i) => i,
499 None => return false,
500 };
501 if !matches_simple_selector(&tree.nodes[node_idx], &part.selector) {
502 return false;
503 }
504
505 if idx == 0 {
506 return true;
507 }
508
509 let next_part = &parts[idx - 1];
510 match part.combinator {
511 Combinator::Child => {
512 current_index = tree.nodes[node_idx].parent;
513 if let Some(parent_idx) = current_index {
514 if !matches_simple_selector(&tree.nodes[parent_idx], &next_part.selector) {
515 return false;
516 }
517 } else {
518 return false;
519 }
520 }
521 Combinator::Descendant => {
522 let mut parent = tree.nodes[node_idx].parent;
523 let mut matched = false;
524 while let Some(parent_idx) = parent {
525 if matches_simple_selector(&tree.nodes[parent_idx], &next_part.selector) {
526 matched = true;
527 current_index = Some(parent_idx);
528 break;
529 }
530 parent = tree.nodes[parent_idx].parent;
531 }
532 if !matched {
533 return false;
534 }
535 }
536 }
537 }
538
539 true
540}
541
542fn matches_simple_selector(node: &DomNode, selector: &SimpleSelector) -> bool {
543 if let Some(tag) = &selector.tag {
544 if node.tag != tag.to_lowercase() {
545 return false;
546 }
547 }
548 if let Some(id) = &selector.id {
549 if node.id.as_deref() != Some(id) {
550 return false;
551 }
552 }
553 for class in &selector.classes {
554 if !node.classes.iter().any(|c| c == class) {
555 return false;
556 }
557 }
558 true
559}
560
561fn is_tag_name_char(b: u8) -> bool {
562 b.is_ascii_alphanumeric() || b == b'-' || b == b':'
563}
564
565fn is_attr_name_char(b: u8) -> bool {
566 b.is_ascii_alphanumeric() || b == b'-' || b == b'_' || b == b':'
567}
568
569fn is_void_tag(tag: &str) -> bool {
570 matches!(
571 tag,
572 "area"
573 | "base"
574 | "br"
575 | "col"
576 | "embed"
577 | "hr"
578 | "img"
579 | "input"
580 | "link"
581 | "meta"
582 | "param"
583 | "source"
584 | "track"
585 | "wbr"
586 )
587}
588
589fn starts_with(bytes: &[u8], idx: usize, pattern: &[u8]) -> bool {
590 bytes.len() >= idx + pattern.len() && &bytes[idx..idx + pattern.len()] == pattern
591}
592
593fn find_char(bytes: &[u8], start: usize, target: u8) -> Option<usize> {
594 (start..bytes.len()).find(|&i| bytes[i] == target)
595}
596
597fn parse_end_tag(html: &str, start: usize) -> Option<(String, usize)> {
598 let bytes = html.as_bytes();
599 let len = bytes.len();
600 if start + 2 >= len {
601 return None;
602 }
603 let mut i = start + 2;
604 while i < len && bytes[i].is_ascii_whitespace() {
605 i += 1;
606 }
607 let name_start = i;
608 while i < len && is_tag_name_char(bytes[i]) {
609 i += 1;
610 }
611 if name_start == i {
612 return None;
613 }
614 let tag_name = html[name_start..i].to_lowercase();
615 if let Some(end_pos) = find_char(bytes, i, b'>') {
616 return Some((tag_name, end_pos + 1));
617 }
618 None
619}
620
621fn find_block_content(html: &str, start: usize, tag: &str) -> Option<(usize, usize, usize)> {
622 let lower = html.to_lowercase();
623 let bytes = lower.as_bytes();
624 let len = bytes.len();
625 let target = format!("</{}", tag.to_lowercase());
626 let target_bytes = target.as_bytes();
627 let mut i = start;
628 while i + target_bytes.len() < len {
629 if bytes[i] == b'<' && starts_with(bytes, i, target_bytes) {
630 if let Some(end_pos) = find_char(bytes, i + target_bytes.len(), b'>') {
631 return Some((start, i, end_pos + 1));
632 }
633 }
634 i += 1;
635 }
636 None
637}
638
639#[cfg(test)]
640mod tests {
641 use super::*;
642
643 #[test]
644 fn test_dom_tree_basic() {
645 let html = r#"
646 <html>
647 <body>
648 <div class="container">
649 <p id="text">Hello</p>
650 </div>
651 </body>
652 </html>
653 "#;
654
655 let result = DomTree::parse(html);
656 let tree = result.dom_tree;
657 assert!(!tree.roots.is_empty());
658
659 let node = tree.find_node_at_position(50);
661 assert!(node.is_some());
662 }
663
664 #[test]
665 fn test_dom_tree_nested_structure() {
666 let html =
667 r#"<div class="outer"><div class="inner"><span id="item">Text</span></div></div>"#;
668
669 let result = DomTree::parse(html);
670 let tree = result.dom_tree;
671
672 assert!(!tree.roots.is_empty());
674
675 assert_eq!(tree.nodes.len(), 3); }
678
679 #[test]
680 fn test_dom_tree_multiple_classes() {
681 let html = r#"<div class="class1 class2 class3">Content</div>"#;
682
683 let result = DomTree::parse(html);
684 assert!(!result.dom_tree.roots.is_empty());
685 }
687
688 #[test]
689 fn test_dom_tree_self_closing_tags() {
690 let html = r#"<div><img src="test.jpg" /><br /><input type="text" /></div>"#;
691
692 let result = DomTree::parse(html);
693 let tree = result.dom_tree;
694
695 assert!(!tree.roots.is_empty());
696 }
698
699 #[test]
700 fn test_dom_tree_find_node_at_position() {
701 let html = r#"<div class="outer"><p id="para">Text</p></div>"#;
702
703 let result = DomTree::parse(html);
704 let tree = result.dom_tree;
705
706 let node = tree.find_node_at_position(5);
708 assert!(node.is_some());
709 }
710
711 #[test]
712 fn test_dom_tree_empty_html() {
713 let html = "";
714 let result = DomTree::parse(html);
715 let tree = result.dom_tree;
716 assert!(tree.roots.is_empty());
717 }
718
719 #[test]
720 fn test_dom_tree_malformed_html() {
721 let html = r#"<div><p>Text"#;
723 let result = DomTree::parse(html);
724 assert!(!result.dom_tree.roots.is_empty());
726 }
727
728 #[test]
729 fn test_parse_inline_styles() {
730 let html = r#"<div style="color: red; background: blue;"></div>"#;
731
732 let parsed = DomTree::parse(html);
733 assert_eq!(parsed.inline_styles.len(), 1);
734
735 let inline = &parsed.inline_styles[0];
736 assert!(inline.value.contains("color: red"));
737 assert!(inline.value.contains("background: blue"));
738 }
739
740 #[test]
741 fn test_parse_style_blocks() {
742 let html = r#"
743 <html>
744 <head>
745 <style>
746 .class { color: red; }
747 </style>
748 </head>
749 <body>
750 <style>
751 #id { background: blue; }
752 </style>
753 </body>
754 </html>
755 "#;
756
757 let parsed = DomTree::parse(html);
758 assert_eq!(parsed.style_blocks.len(), 2);
759
760 assert!(parsed.style_blocks[0].content.contains("color: red"));
761 assert!(parsed.style_blocks[1].content.contains("background: blue"));
762 }
763
764 #[test]
765 fn test_parse_nested_style_tags() {
766 let html = r#"<style>outer { color: red; }<style>inner</style></style>"#;
767
768 let parsed = DomTree::parse(html);
769 assert!(!parsed.style_blocks.is_empty());
771 }
772
773 #[test]
774 fn test_dom_tree_comment_handling() {
775 let html = r#"<div><!-- This is a comment --><p>Text</p></div>"#;
776
777 let result = DomTree::parse(html);
778 let tree = result.dom_tree;
779
780 assert!(!tree.roots.is_empty());
782 }
783
784 #[test]
785 fn test_attributes_with_quotes() {
786 let html = r#"<div class="test" id='myid' data-value=unquoted></div>"#;
787
788 let result = DomTree::parse(html);
789 let tree = result.dom_tree;
790
791 assert!(!tree.roots.is_empty());
793 }
794
795 #[test]
796 fn test_dom_tree_classname_alias() {
797 let html = r#"<div className="react-class">Content</div>"#;
798 let result = DomTree::parse(html);
799 let tree = result.dom_tree;
800 assert!(!tree.roots.is_empty());
801 let node = &tree.nodes[tree.roots[0]];
802 assert!(node.classes.contains(&"react-class".to_string()));
803 }
804}