1use std::collections::HashMap;
7
8use crate::dom::{DomNodeId, DomTree};
9
10pub fn parse_html(html: &str) -> DomTree {
12 parse_html_simple(html)
13}
14
15pub fn parse_html_simple(html: &str) -> DomTree {
17 let mut tree = DomTree::new();
18 let root = tree.root();
19 let html_el = tree.add_element(root, "html", HashMap::new());
20 parse_children(&mut tree, html_el, html);
21 tree
22}
23
24fn parse_children(tree: &mut DomTree, parent: DomNodeId, html: &str) {
26 let mut pos = 0;
27 let bytes = html.as_bytes();
28 let len = bytes.len();
29
30 while pos < len {
31 if bytes[pos] == b'<' {
32 if pos + 1 < len && bytes[pos + 1] == b'/' {
34 return;
36 }
37
38 if html[pos..].starts_with("<!--") {
40 if let Some(end) = html[pos..].find("-->") {
41 pos += end + 3;
42 continue;
43 }
44 }
45
46 if html[pos..].starts_with("<!") {
48 while pos < len && bytes[pos] != b'>' {
49 pos += 1;
50 }
51 pos += 1;
52 continue;
53 }
54
55 pos += 1; let tag_start = pos;
58
59 while pos < len && bytes[pos] != b' ' && bytes[pos] != b'>' && bytes[pos] != b'/' {
61 pos += 1;
62 }
63 let tag_name = html[tag_start..pos].to_string();
64
65 if tag_name.eq_ignore_ascii_case("script") {
67 if let Some(end) = find_tag_end(html, pos, &tag_name) {
68 pos = end;
69 continue;
70 }
71 }
72
73 let mut attrs = HashMap::new();
75 while pos < len && bytes[pos] != b'>' && bytes[pos] != b'/' {
76 while pos < len && bytes[pos] == b' ' {
78 pos += 1;
79 }
80 if pos >= len || bytes[pos] == b'>' || bytes[pos] == b'/' {
81 break;
82 }
83
84 let attr_start = pos;
86 while pos < len
87 && bytes[pos] != b'='
88 && bytes[pos] != b' '
89 && bytes[pos] != b'>'
90 && bytes[pos] != b'/'
91 {
92 pos += 1;
93 }
94 let attr_name = html[attr_start..pos].to_string();
95
96 if pos < len && bytes[pos] == b'=' {
98 pos += 1; let value = if pos < len && (bytes[pos] == b'"' || bytes[pos] == b'\'') {
100 let quote = bytes[pos];
101 pos += 1;
102 let val_start = pos;
103 while pos < len && bytes[pos] != quote {
104 pos += 1;
105 }
106 let val = html[val_start..pos].to_string();
107 if pos < len {
108 pos += 1; }
110 val
111 } else {
112 let val_start = pos;
113 while pos < len && bytes[pos] != b' ' && bytes[pos] != b'>' {
114 pos += 1;
115 }
116 html[val_start..pos].to_string()
117 };
118 if !attr_name.is_empty() {
119 attrs.insert(attr_name, value);
120 }
121 } else if !attr_name.is_empty() {
122 attrs.insert(attr_name, String::new());
123 }
124 }
125
126 let self_closing = pos < len && bytes[pos] == b'/';
128 if self_closing {
129 pos += 1;
130 }
131 if pos < len && bytes[pos] == b'>' {
132 pos += 1;
133 }
134
135 let is_void = is_void_element(&tag_name);
136
137 let node = tree.add_element(parent, &tag_name, attrs);
138
139 if !self_closing && !is_void {
140 let _child_start = pos;
142 parse_children(tree, node, &html[pos..]);
143 let close_tag = format!("</{}>", tag_name);
145 let close_tag_lower = close_tag.to_lowercase();
146 if let Some(close_pos) = html[pos..].to_lowercase().find(&close_tag_lower) {
148 pos += close_pos + close_tag.len();
149 } else {
150 break;
152 }
153 }
154 } else {
155 let text_start = pos;
157 while pos < len && bytes[pos] != b'<' {
158 pos += 1;
159 }
160 let text = html[text_start..pos].trim();
161 if !text.is_empty() {
162 tree.add_text(parent, text);
163 }
164 }
165 }
166}
167
/// Reports whether `tag` names an HTML void element, i.e. one that never has
/// content or a closing tag (`<br>`, `<img>`, ...).
///
/// The name is lowercased before it is compared, so `"BR"` and `"br"` are
/// treated alike.
fn is_void_element(tag: &str) -> bool {
    const VOID_ELEMENTS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];

    let lowered = tag.to_lowercase();
    VOID_ELEMENTS.iter().any(|name| *name == lowered)
}
187
/// Finds where the element named `tag_name` ends, searching `html` from byte
/// offset `start`.
///
/// Returns the offset just past the first `</tag_name>` found at or after
/// `start`; the tag name is matched ASCII-case-insensitively. When there is
/// no such closing tag, returns the offset just past the next `>` instead.
/// Returns `None` when neither exists or `start` is past the end of `html`.
///
/// The search runs on the original bytes rather than on a lowercased copy:
/// Unicode lowercasing can change the byte length of text, which would make
/// offsets found in the copy invalid for `html`.
fn find_tag_end(html: &str, start: usize, tag_name: &str) -> Option<usize> {
    let close_tag = format!("</{}>", tag_name);
    let needle = close_tag.as_bytes();
    let tail = html.as_bytes().get(start..)?;

    // `needle` always contains at least `</>`, so the window size is non-zero.
    tail.windows(needle.len())
        .position(|window| window.eq_ignore_ascii_case(needle))
        .map(|offset| start + offset + needle.len())
        .or_else(|| {
            tail.iter()
                .position(|&byte| byte == b'>')
                .map(|offset| start + offset + 1)
        })
}
199
#[cfg(test)]
mod tests {
    use super::*;

    /// A paragraph nested in a div: both elements exist and the div has children.
    #[test]
    fn test_parse_simple_html() {
        let tree = parse_html_simple("<div><p>Hello</p></div>");

        let div_id = tree.find_element_by_tag("div").unwrap();
        let _paragraph_id = tree.find_element_by_tag("p").unwrap();
        assert!(!tree.children(div_id).is_empty());
    }

    /// Double-quoted attributes are stored under their names with the quotes removed.
    #[test]
    fn test_parse_with_attributes() {
        let markup = r#"<div id="test" class="box" style="width: 100px"></div>"#;
        let tree = parse_html_simple(markup);

        let div_id = tree.find_element_by_tag("div").unwrap();
        let element = tree.node(div_id);

        let expected = [("id", "test"), ("class", "box"), ("style", "width: 100px")];
        for &(name, value) in &expected {
            assert_eq!(element.get_attribute(name), Some(value));
        }
    }

    /// Two sibling elements inside a div both become children of that div.
    #[test]
    fn test_parse_nested() {
        let tree = parse_html_simple("<div><span>text</span><p>paragraph</p></div>");

        let div_id = tree.find_element_by_tag("div").unwrap();
        let child_count = tree.children(div_id).len();
        assert_eq!(child_count, 2);
    }

    /// A style element keeps its content as a child, and the markup after it is still parsed.
    #[test]
    fn test_parse_with_style_tag() {
        let markup = r#"<style>.box { width: 100px; }</style><div class="box"></div>"#;
        let tree = parse_html_simple(markup);

        let style_id = tree.find_element_by_tag("style").unwrap();
        let _div_id = tree.find_element_by_tag("div").unwrap();
        assert!(!tree.children(style_id).is_empty());
    }

    /// Void elements do not swallow their following siblings.
    #[test]
    fn test_parse_void_elements() {
        let markup = r#"<div><br><hr><img src="test.png"></div>"#;
        let tree = parse_html_simple(markup);

        let div_id = tree.find_element_by_tag("div").unwrap();
        let child_count = tree.children(div_id).len();
        assert_eq!(child_count, 3);
    }
}