Skip to main content

cssbox_dom/
html.rs

1//! HTML parsing for the layout engine.
2//!
3//! Provides a simple HTML parser sufficient for WPT test files and common HTML.
4//! For production use, this would be replaced with a full html5ever integration.
5
6use std::collections::HashMap;
7
8use crate::dom::{DomNodeId, DomTree};
9
10/// Parse an HTML document string into a DOM tree.
11pub fn parse_html(html: &str) -> DomTree {
12    parse_html_simple(html)
13}
14
15/// Simple recursive HTML parser for test cases and common HTML.
16pub fn parse_html_simple(html: &str) -> DomTree {
17    let mut tree = DomTree::new();
18    let root = tree.root();
19    let html_el = tree.add_element(root, "html", HashMap::new());
20    parse_children(&mut tree, html_el, html);
21    tree
22}
23
24/// Simple recursive HTML parser.
25fn parse_children(tree: &mut DomTree, parent: DomNodeId, html: &str) {
26    let mut pos = 0;
27    let bytes = html.as_bytes();
28    let len = bytes.len();
29
30    while pos < len {
31        if bytes[pos] == b'<' {
32            // Check for closing tag
33            if pos + 1 < len && bytes[pos + 1] == b'/' {
34                // Skip closing tag — return to parent parser
35                return;
36            }
37
38            // Check for comment
39            if html[pos..].starts_with("<!--") {
40                if let Some(end) = html[pos..].find("-->") {
41                    pos += end + 3;
42                    continue;
43                }
44            }
45
46            // Check for <!DOCTYPE
47            if html[pos..].starts_with("<!") {
48                while pos < len && bytes[pos] != b'>' {
49                    pos += 1;
50                }
51                pos += 1;
52                continue;
53            }
54
55            // Parse opening tag
56            pos += 1; // skip '<'
57            let tag_start = pos;
58
59            // Get tag name
60            while pos < len && bytes[pos] != b' ' && bytes[pos] != b'>' && bytes[pos] != b'/' {
61                pos += 1;
62            }
63            let tag_name = html[tag_start..pos].to_string();
64
65            // Skip <script> and <link> tags entirely
66            if tag_name.eq_ignore_ascii_case("script") {
67                if let Some(end) = find_tag_end(html, pos, &tag_name) {
68                    pos = end;
69                    continue;
70                }
71            }
72
73            // Parse attributes
74            let mut attrs = HashMap::new();
75            while pos < len && bytes[pos] != b'>' && bytes[pos] != b'/' {
76                // Skip whitespace
77                while pos < len && bytes[pos] == b' ' {
78                    pos += 1;
79                }
80                if pos >= len || bytes[pos] == b'>' || bytes[pos] == b'/' {
81                    break;
82                }
83
84                // Attribute name
85                let attr_start = pos;
86                while pos < len
87                    && bytes[pos] != b'='
88                    && bytes[pos] != b' '
89                    && bytes[pos] != b'>'
90                    && bytes[pos] != b'/'
91                {
92                    pos += 1;
93                }
94                let attr_name = html[attr_start..pos].to_string();
95
96                // Check for = and value
97                if pos < len && bytes[pos] == b'=' {
98                    pos += 1; // skip '='
99                    let value = if pos < len && (bytes[pos] == b'"' || bytes[pos] == b'\'') {
100                        let quote = bytes[pos];
101                        pos += 1;
102                        let val_start = pos;
103                        while pos < len && bytes[pos] != quote {
104                            pos += 1;
105                        }
106                        let val = html[val_start..pos].to_string();
107                        if pos < len {
108                            pos += 1; // skip closing quote
109                        }
110                        val
111                    } else {
112                        let val_start = pos;
113                        while pos < len && bytes[pos] != b' ' && bytes[pos] != b'>' {
114                            pos += 1;
115                        }
116                        html[val_start..pos].to_string()
117                    };
118                    if !attr_name.is_empty() {
119                        attrs.insert(attr_name, value);
120                    }
121                } else if !attr_name.is_empty() {
122                    attrs.insert(attr_name, String::new());
123                }
124            }
125
126            // Check for self-closing
127            let self_closing = pos < len && bytes[pos] == b'/';
128            if self_closing {
129                pos += 1;
130            }
131            if pos < len && bytes[pos] == b'>' {
132                pos += 1;
133            }
134
135            let is_void = is_void_element(&tag_name);
136
137            let node = tree.add_element(parent, &tag_name, attrs);
138
139            if !self_closing && !is_void {
140                // Parse children
141                let _child_start = pos;
142                parse_children(tree, node, &html[pos..]);
143                // Skip past closing tag
144                let close_tag = format!("</{}>", tag_name);
145                let close_tag_lower = close_tag.to_lowercase();
146                // Try case-insensitive match
147                if let Some(close_pos) = html[pos..].to_lowercase().find(&close_tag_lower) {
148                    pos += close_pos + close_tag.len();
149                } else {
150                    // No closing tag found
151                    break;
152                }
153            }
154        } else {
155            // Text content
156            let text_start = pos;
157            while pos < len && bytes[pos] != b'<' {
158                pos += 1;
159            }
160            let text = html[text_start..pos].trim();
161            if !text.is_empty() {
162                tree.add_text(parent, text);
163            }
164        }
165    }
166}
167
/// Report whether `tag` names an HTML void element, i.e. one that never has
/// children or a closing tag (`<br>`, `<img>`, ...).
///
/// The comparison is ASCII-case-insensitive, as HTML tag names are. Unicode
/// case folding is deliberately not applied, so look-alike characters such as
/// the Kelvin sign (U+212A) do not match `k`. No allocation is performed.
fn is_void_element(tag: &str) -> bool {
    const VOID_ELEMENTS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    VOID_ELEMENTS
        .iter()
        .any(|name| tag.eq_ignore_ascii_case(name))
}
187
/// Find where the element named `tag_name` ends, searching from byte offset `start`.
///
/// Looks for `</tag_name>` (ASCII-case-insensitive) in `html[start..]` and
/// returns the byte offset just past it. When no closing tag exists the
/// element is taken to be self-closing or void, and the offset just past the
/// next `>` is returned instead. Returns `None` when neither is found or when
/// `start` is not a valid offset into `html`.
///
/// The search runs on the original bytes, so the returned offset is always
/// valid for `html` even when non-ASCII text precedes the closing tag.
fn find_tag_end(html: &str, start: usize, tag_name: &str) -> Option<usize> {
    let tail = html.get(start..)?;
    let close_tag = format!("</{}>", tag_name);
    let needle = close_tag.as_bytes();

    // `needle` is at least three bytes long, so `windows` never sees a zero size.
    let closing = tail
        .as_bytes()
        .windows(needle.len())
        .position(|window| window.eq_ignore_ascii_case(needle));

    match closing {
        Some(offset) => Some(start + offset + needle.len()),
        // Self-closing or void: end at the first '>'.
        None => tail.find('>').map(|offset| start + offset + 1),
    }
}
199
#[cfg(test)]
mod tests {
    use super::*;

    /// A paragraph nested in a div: both elements exist and the div has children.
    #[test]
    fn test_parse_simple_html() {
        let tree = parse_html_simple(r#"<div><p>Hello</p></div>"#);

        let div_id = tree.find_element_by_tag("div").unwrap();
        let _paragraph_id = tree.find_element_by_tag("p").unwrap();
        assert!(!tree.children(div_id).is_empty());
    }

    /// Double-quoted attributes are stored under their names with exact values.
    #[test]
    fn test_parse_with_attributes() {
        let tree = parse_html_simple(r#"<div id="test" class="box" style="width: 100px"></div>"#);
        let div_id = tree.find_element_by_tag("div").unwrap();
        let div_node = tree.node(div_id);

        assert_eq!(div_node.get_attribute("id"), Some("test"));
        assert_eq!(div_node.get_attribute("class"), Some("box"));
        assert_eq!(div_node.get_attribute("style"), Some("width: 100px"));
    }

    /// Two sibling elements inside a div both become its children.
    #[test]
    fn test_parse_nested() {
        let tree = parse_html_simple(r#"<div><span>text</span><p>paragraph</p></div>"#);
        let div_id = tree.find_element_by_tag("div").unwrap();

        assert_eq!(tree.children(div_id).len(), 2);
    }

    /// A style element keeps its CSS as a text child, and parsing continues after it.
    #[test]
    fn test_parse_with_style_tag() {
        let tree =
            parse_html_simple(r#"<style>.box { width: 100px; }</style><div class="box"></div>"#);
        let style_id = tree.find_element_by_tag("style").unwrap();
        let _div_id = tree.find_element_by_tag("div").unwrap();

        // The CSS source is expected as a text child of <style>.
        assert!(!tree.children(style_id).is_empty());
    }

    /// Void elements do not swallow their following siblings.
    #[test]
    fn test_parse_void_elements() {
        let tree = parse_html_simple(r#"<div><br><hr><img src="test.png"></div>"#);
        let div_id = tree.find_element_by_tag("div").unwrap();

        assert_eq!(tree.children(div_id).len(), 3);
    }
}